aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild5
-rw-r--r--arch/x86/Kconfig494
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Kconfig.debug34
-rw-r--r--arch/x86/Makefile137
-rw-r--r--arch/x86/Makefile.um2
-rw-r--r--arch/x86/Makefile_32.cpu14
-rw-r--r--arch/x86/boot/.gitignore1
-rw-r--r--arch/x86/boot/Makefile45
-rw-r--r--arch/x86/boot/boot.h75
-rw-r--r--arch/x86/boot/compressed/Makefile27
-rw-r--r--arch/x86/boot/compressed/acpi.c176
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c3
-rw-r--r--arch/x86/boot/compressed/efi.c234
-rw-r--r--arch/x86/boot/compressed/efi.h126
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S46
-rw-r--r--arch/x86/boot/compressed/head_32.S3
-rw-r--r--arch/x86/boot/compressed/head_64.S234
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c39
-rw-r--r--arch/x86/boot/compressed/idt_64.c32
-rw-r--r--arch/x86/boot/compressed/kaslr.c13
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S160
-rw-r--r--arch/x86/boot/compressed/misc.c34
-rw-r--r--arch/x86/boot/compressed/misc.h72
-rw-r--r--arch/x86/boot/compressed/pgtable.h2
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c5
-rw-r--r--arch/x86/boot/compressed/sev-es.c213
-rw-r--r--arch/x86/boot/compressed/sev.c437
-rw-r--r--arch/x86/boot/compressed/tdcall.S3
-rw-r--r--arch/x86/boot/compressed/tdx.c77
-rw-r--r--arch/x86/boot/compressed/tdx.h13
-rw-r--r--arch/x86/boot/cpucheck.c30
-rw-r--r--arch/x86/boot/cpuflags.c3
-rw-r--r--arch/x86/boot/cpuflags.h1
-rw-r--r--arch/x86/boot/genimage.sh304
-rw-r--r--arch/x86/boot/header.S4
-rwxr-xr-x[-rw-r--r--]arch/x86/boot/install.sh22
-rw-r--r--arch/x86/boot/io.h41
-rw-r--r--arch/x86/boot/main.c6
-rw-r--r--arch/x86/boot/msr.h26
-rw-r--r--arch/x86/boot/mtools.conf.in6
-rw-r--r--arch/x86/boot/string.h3
-rw-r--r--arch/x86/boot/video-vesa.c4
-rw-r--r--arch/x86/coco/Makefile8
-rw-r--r--arch/x86/coco/core.c140
-rw-r--r--arch/x86/coco/tdx/Makefile3
-rw-r--r--arch/x86/coco/tdx/tdcall.S205
-rw-r--r--arch/x86/coco/tdx/tdx.c777
-rw-r--r--arch/x86/configs/i386_defconfig51
-rw-r--r--arch/x86/configs/x86_64_defconfig38
-rw-r--r--arch/x86/crypto/Makefile15
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S48
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S65
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S60
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S68
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c9
-rw-r--r--arch/x86/crypto/blake2s-core.S4
-rw-r--r--arch/x86/crypto/blake2s-glue.c68
-rw-r--r--arch/x86/crypto/blake2s-shash.c77
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/blowfish_glue.c20
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S14
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S19
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/camellia_glue.c8
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S10
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S8
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S2
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S12
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S2
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c775
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S4
-rw-r--r--arch/x86/crypto/des3_ede_glue.c12
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S6
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S2
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S2
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl38
-rw-r--r--arch/x86/crypto/poly1305_glue.c6
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S6
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c8
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S10
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S10
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S15
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S43
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S44
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S43
-rw-r--r--arch/x86/crypto/sm3-avx-asm_64.S517
-rw-r--r--arch/x86/crypto/sm3_avx_glue.c134
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S594
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S501
-rw-r--r--arch/x86/crypto/sm4-avx.h24
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c169
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c487
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S4
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S8
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/crypto/twofish_glue.c8
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c10
-rw-r--r--arch/x86/entry/Makefile10
-rw-r--r--arch/x86/entry/calling.h58
-rw-r--r--arch/x86/entry/common.c95
-rw-r--r--arch/x86/entry/entry_32.S144
-rw-r--r--arch/x86/entry/entry_64.S171
-rw-r--r--arch/x86/entry/entry_64_compat.S116
-rw-r--r--arch/x86/entry/syscall_32.c20
-rw-r--r--arch/x86/entry/syscall_64.c17
-rw-r--r--arch/x86/entry/syscall_x32.c35
-rw-r--r--arch/x86/entry/syscalls/Makefile41
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl14
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl10
-rw-r--r--arch/x86/entry/syscalls/syscallhdr.sh35
-rw-r--r--arch/x86/entry/syscalls/syscalltbl.sh46
-rw-r--r--arch/x86/entry/thunk_32.S2
-rw-r--r--arch/x86/entry/thunk_64.S2
-rw-r--r--arch/x86/entry/vdso/Makefile7
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S1
-rw-r--r--arch/x86/entry/vdso/vdso2c.c2
-rw-r--r--arch/x86/entry/vdso/vdso2c.h2
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S6
-rw-r--r--arch/x86/entry/vdso/vma.c4
-rw-r--r--arch/x86/entry/vdso/vsgx.S4
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c5
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S6
-rw-r--r--arch/x86/events/Kconfig30
-rw-r--r--arch/x86/events/amd/Makefile6
-rw-r--r--arch/x86/events/amd/brs.c367
-rw-r--r--arch/x86/events/amd/core.c507
-rw-r--r--arch/x86/events/amd/ibs.c241
-rw-r--r--arch/x86/events/amd/iommu.c56
-rw-r--r--arch/x86/events/amd/iommu.h21
-rw-r--r--arch/x86/events/amd/power.c1
-rw-r--r--arch/x86/events/amd/uncore.c46
-rw-r--r--arch/x86/events/core.c418
-rw-r--r--arch/x86/events/intel/Makefile2
-rw-r--r--arch/x86/events/intel/bts.c8
-rw-r--r--arch/x86/events/intel/core.c926
-rw-r--r--arch/x86/events/intel/cstate.c59
-rw-r--r--arch/x86/events/intel/ds.c80
-rw-r--r--arch/x86/events/intel/lbr.c284
-rw-r--r--arch/x86/events/intel/p4.c22
-rw-r--r--arch/x86/events/intel/pt.c84
-rw-r--r--arch/x86/events/intel/uncore.c241
-rw-r--r--arch/x86/events/intel/uncore.h26
-rw-r--r--arch/x86/events/intel/uncore_discovery.c630
-rw-r--r--arch/x86/events/intel/uncore_discovery.h152
-rw-r--r--arch/x86/events/intel/uncore_snb.c659
-rw-r--r--arch/x86/events/intel/uncore_snbep.c922
-rw-r--r--arch/x86/events/msr.c6
-rw-r--r--arch/x86/events/perf_event.h300
-rw-r--r--arch/x86/events/rapl.c17
-rw-r--r--arch/x86/events/zhaoxin/core.c2
-rw-r--r--arch/x86/hyperv/Makefile2
-rw-r--r--arch/x86/hyperv/hv_apic.c81
-rw-r--r--arch/x86/hyperv/hv_init.c320
-rw-r--r--arch/x86/hyperv/hv_proc.c26
-rw-r--r--arch/x86/hyperv/hv_spinlock.c8
-rw-r--r--arch/x86/hyperv/irqdomain.c61
-rw-r--r--arch/x86/hyperv/ivm.c389
-rw-r--r--arch/x86/hyperv/mmu.c37
-rw-r--r--arch/x86/hyperv/nested.c8
-rw-r--r--arch/x86/ia32/Makefile2
-rw-r--r--arch/x86/ia32/audit.c13
-rw-r--r--arch/x86/ia32/ia32_aout.c327
-rw-r--r--arch/x86/ia32/ia32_signal.c15
-rw-r--r--arch/x86/include/asm/GEN-for-each-reg.h14
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/acenv.h14
-rw-r--r--arch/x86/include/asm/agp.h2
-rw-r--r--arch/x86/include/asm/alternative-asm.h114
-rw-r--r--arch/x86/include/asm/alternative.h151
-rw-r--r--arch/x86/include/asm/amd-ibs.h132
-rw-r--r--arch/x86/include/asm/amd_hsmp.h16
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h8
-rw-r--r--arch/x86/include/asm/apicdef.h6
-rw-r--r--arch/x86/include/asm/asm-prototypes.h17
-rw-r--r--arch/x86/include/asm/asm.h115
-rw-r--r--arch/x86/include/asm/atomic.h2
-rw-r--r--arch/x86/include/asm/barrier.h17
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/bootparam_utils.h1
-rw-r--r--arch/x86/include/asm/bug.h32
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h21
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h6
-rw-r--r--arch/x86/include/asm/coco.h32
-rw-r--r--arch/x86/include/asm/compat.h137
-rw-r--r--arch/x86/include/asm/cpu.h39
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h10
-rw-r--r--arch/x86/include/asm/cpufeature.h57
-rw-r--r--arch/x86/include/asm/cpufeatures.h29
-rw-r--r--arch/x86/include/asm/cpuid.h34
-rw-r--r--arch/x86/include/asm/cpumask.h10
-rw-r--r--arch/x86/include/asm/crash.h6
-rw-r--r--arch/x86/include/asm/desc.h25
-rw-r--r--arch/x86/include/asm/disabled-features.h22
-rw-r--r--arch/x86/include/asm/dma-mapping.h12
-rw-r--r--arch/x86/include/asm/efi.h39
-rw-r--r--arch/x86/include/asm/elf.h29
-rw-r--r--arch/x86/include/asm/entry-common.h22
-rw-r--r--arch/x86/include/asm/extable.h48
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h67
-rw-r--r--arch/x86/include/asm/floppy.h1
-rw-r--r--arch/x86/include/asm/fpu/api.h74
-rw-r--r--arch/x86/include/asm/fpu/internal.h596
-rw-r--r--arch/x86/include/asm/fpu/sched.h68
-rw-r--r--arch/x86/include/asm/fpu/signal.h12
-rw-r--r--arch/x86/include/asm/fpu/types.h246
-rw-r--r--arch/x86/include/asm/fpu/xcr.h23
-rw-r--r--arch/x86/include/asm/fpu/xstate.h99
-rw-r--r--arch/x86/include/asm/ftrace.h16
-rw-r--r--arch/x86/include/asm/futex.h28
-rw-r--r--arch/x86/include/asm/gart.h5
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h207
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/ia32.h2
-rw-r--r--arch/x86/include/asm/ibt.h105
-rw-r--r--arch/x86/include/asm/idtentry.h79
-rw-r--r--arch/x86/include/asm/inat.h2
-rw-r--r--arch/x86/include/asm/insn-eval.h20
-rw-r--r--arch/x86/include/asm/insn.h48
-rw-r--r--arch/x86/include/asm/intel-family.h58
-rw-r--r--arch/x86/include/asm/intel_ds.h5
-rw-r--r--arch/x86/include/asm/intel_pconfig.h2
-rw-r--r--arch/x86/include/asm/intel_pt.h4
-rw-r--r--arch/x86/include/asm/io.h74
-rw-r--r--arch/x86/include/asm/iommu.h8
-rw-r--r--arch/x86/include/asm/iommu_table.h102
-rw-r--r--arch/x86/include/asm/irq_stack.h46
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/irqflags.h17
-rw-r--r--arch/x86/include/asm/jump_label.h89
-rw-r--r--arch/x86/include/asm/kexec.h15
-rw-r--r--arch/x86/include/asm/kfence.h11
-rw-r--r--arch/x86/include/asm/kprobes.h22
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h133
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h31
-rw-r--r--arch/x86/include/asm/kvm_host.h732
-rw-r--r--arch/x86/include/asm/kvm_page_track.h13
-rw-r--r--arch/x86/include/asm/kvm_para.h42
-rw-r--r--arch/x86/include/asm/kvmclock.h14
-rw-r--r--arch/x86/include/asm/linkage.h45
-rw-r--r--arch/x86/include/asm/livepatch.h20
-rw-r--r--arch/x86/include/asm/mce.h54
-rw-r--r--arch/x86/include/asm/mem_encrypt.h23
-rw-r--r--arch/x86/include/asm/microcode.h5
-rw-r--r--arch/x86/include/asm/microcode_amd.h2
-rw-r--r--arch/x86/include/asm/microcode_intel.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/mmx.h15
-rw-r--r--arch/x86/include/asm/mshyperv.h165
-rw-r--r--arch/x86/include/asm/msi.h19
-rw-r--r--arch/x86/include/asm/msr-index.h112
-rw-r--r--arch/x86/include/asm/msr.h45
-rw-r--r--arch/x86/include/asm/mtrr.h8
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/nops.h184
-rw-r--r--arch/x86/include/asm/nospec-branch.h101
-rw-r--r--arch/x86/include/asm/page.h16
-rw-r--r--arch/x86/include/asm/page_32.h14
-rw-r--r--arch/x86/include/asm/page_64.h36
-rw-r--r--arch/x86/include/asm/page_64_types.h25
-rw-r--r--arch/x86/include/asm/paravirt.h212
-rw-r--r--arch/x86/include/asm/paravirt_api_clock.h1
-rw-r--r--arch/x86/include/asm/paravirt_types.h222
-rw-r--r--arch/x86/include/asm/pc-conf-reg.h33
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h24
-rw-r--r--arch/x86/include/asm/percpu.h6
-rw-r--r--arch/x86/include/asm/perf_event.h51
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable.h146
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h8
-rw-r--r--arch/x86/include/asm/pkeys.h19
-rw-r--r--arch/x86/include/asm/pkru.h62
-rw-r--r--arch/x86/include/asm/preempt.h12
-rw-r--r--arch/x86/include/asm/processor-cyrix.h8
-rw-r--r--arch/x86/include/asm/processor.h79
-rw-r--r--arch/x86/include/asm/proto.h10
-rw-r--r--arch/x86/include/asm/ptrace.h13
-rw-r--r--arch/x86/include/asm/qspinlock.h1
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h7
-rw-r--r--arch/x86/include/asm/realmode.h2
-rw-r--r--arch/x86/include/asm/required-features.h4
-rw-r--r--arch/x86/include/asm/segment.h44
-rw-r--r--arch/x86/include/asm/set_memory.h55
-rw-r--r--arch/x86/include/asm/setup.h53
-rw-r--r--arch/x86/include/asm/sev-common.h174
-rw-r--r--arch/x86/include/asm/sev-es.h114
-rw-r--r--arch/x86/include/asm/sev.h231
-rw-r--r--arch/x86/include/asm/sgx.h (renamed from arch/x86/kernel/cpu/sgx/arch.h)70
-rw-r--r--arch/x86/include/asm/shared/io.h34
-rw-r--r--arch/x86/include/asm/shared/msr.h15
-rw-r--r--arch/x86/include/asm/shared/tdx.h40
-rw-r--r--arch/x86/include/asm/sigframe.h2
-rw-r--r--arch/x86/include/asm/signal.h1
-rw-r--r--arch/x86/include/asm/smap.h29
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/special_insns.h27
-rw-r--r--arch/x86/include/asm/stackprotector.h79
-rw-r--r--arch/x86/include/asm/stacktrace.h10
-rw-r--r--arch/x86/include/asm/static_call.h6
-rw-r--r--arch/x86/include/asm/string_32.h33
-rw-r--r--arch/x86/include/asm/suspend_32.h8
-rw-r--r--arch/x86/include/asm/suspend_64.h12
-rw-r--r--arch/x86/include/asm/svm.h230
-rw-r--r--arch/x86/include/asm/swiotlb.h30
-rw-r--r--arch/x86/include/asm/switch_to.h15
-rw-r--r--arch/x86/include/asm/syscall.h46
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h17
-rw-r--r--arch/x86/include/asm/sysfb.h94
-rw-r--r--arch/x86/include/asm/tdx.h91
-rw-r--r--arch/x86/include/asm/text-patching.h32
-rw-r--r--arch/x86/include/asm/thermal.h4
-rw-r--r--arch/x86/include/asm/thread_info.h21
-rw-r--r--arch/x86/include/asm/timex.h9
-rw-r--r--arch/x86/include/asm/tlbflush.h152
-rw-r--r--arch/x86/include/asm/topology.h18
-rw-r--r--arch/x86/include/asm/trace/fpu.h4
-rw-r--r--arch/x86/include/asm/trace/hyperv.h2
-rw-r--r--arch/x86/include/asm/traps.h10
-rw-r--r--arch/x86/include/asm/tsc.h7
-rw-r--r--arch/x86/include/asm/uaccess.h227
-rw-r--r--arch/x86/include/asm/uaccess_64.h7
-rw-r--r--arch/x86/include/asm/unaligned.h15
-rw-r--r--arch/x86/include/asm/unistd.h9
-rw-r--r--arch/x86/include/asm/unwind.h28
-rw-r--r--arch/x86/include/asm/unwind_hints.h5
-rw-r--r--arch/x86/include/asm/user_32.h4
-rw-r--r--arch/x86/include/asm/user_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv_geo.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/vdso.h2
-rw-r--r--arch/x86/include/asm/vdso/clocksource.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h3
-rw-r--r--arch/x86/include/asm/vmalloc.h20
-rw-r--r--arch/x86/include/asm/vmx.h11
-rw-r--r--arch/x86/include/asm/word-at-a-time.h66
-rw-r--r--arch/x86/include/asm/x86_init.h22
-rw-r--r--arch/x86/include/asm/xen/cpuid.h7
-rw-r--r--arch/x86/include/asm/xen/hypercall.h235
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h19
-rw-r--r--arch/x86/include/asm/xen/page-coherent.h24
-rw-r--r--arch/x86/include/asm/xen/page.h22
-rw-r--r--arch/x86/include/asm/xen/pci.h18
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h12
-rw-r--r--arch/x86/include/asm/xor.h42
-rw-r--r--arch/x86/include/asm/xor_32.h42
-rw-r--r--arch/x86/include/asm/xor_avx.h21
-rw-r--r--arch/x86/include/uapi/asm/amd_hsmp.h307
-rw-r--r--arch/x86/include/uapi/asm/auxvec.h4
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h8
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h1
-rw-r--r--arch/x86/include/uapi/asm/hwcap2.h6
-rw-r--r--arch/x86/include/uapi/asm/kvm.h50
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h14
-rw-r--r--arch/x86/include/uapi/asm/mman.h14
-rw-r--r--arch/x86/include/uapi/asm/msgbuf.h2
-rw-r--r--arch/x86/include/uapi/asm/prctl.h24
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/include/uapi/asm/sgx.h4
-rw-r--r--arch/x86/include/uapi/asm/shmbuf.h8
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h2
-rw-r--r--arch/x86/include/uapi/asm/signal.h2
-rw-r--r--arch/x86/include/uapi/asm/svm.h16
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/Makefile25
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c294
-rw-r--r--arch/x86/kernel/acpi/cppc.c102
-rw-r--r--arch/x86/kernel/acpi/cppc_msr.c49
-rw-r--r--arch/x86/kernel/acpi/cstate.c18
-rw-r--r--arch/x86/kernel/acpi/sleep.c25
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S6
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/alternative.c738
-rw-r--r--arch/x86/kernel/amd_gart_64.c23
-rw-r--r--arch/x86/kernel/amd_nb.c71
-rw-r--r--arch/x86/kernel/aperture_64.c32
-rw-r--r--arch/x86/kernel/apic/apic.c48
-rw-r--r--arch/x86/kernel/apic/io_apic.c34
-rw-r--r--arch/x86/kernel/apic/msi.c22
-rw-r--r--arch/x86/kernel/apic/vector.c43
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c27
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c49
-rw-r--r--arch/x86/kernel/apm_32.c13
-rw-r--r--arch/x86/kernel/asm-offsets.c27
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/audit_64.c10
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c68
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c480
-rw-r--r--arch/x86/kernel/cpu/bugs.c523
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c10
-rw-r--r--arch/x86/kernel/cpu/common.c462
-rw-r--r--arch/x86/kernel/cpu/cpu.h6
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c6
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c71
-rw-r--r--arch/x86/kernel/cpu/hygon.c10
-rw-r--r--arch/x86/kernel/cpu/intel.c276
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c45
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c377
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c11
-rw-r--r--arch/x86/kernel/cpu/mce/core.c646
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c58
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c45
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h96
-rw-r--r--arch/x86/kernel/cpu/mce/p5.c6
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c185
-rw-r--r--arch/x86/kernel/cpu/mce/winchip.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c14
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c160
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c68
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c127
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c10
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c284
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c163
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h250
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c81
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c28
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c485
-rw-r--r--arch/x86/kernel/cpu/scattered.c4
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c17
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c310
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h14
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h64
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c43
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c442
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h48
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c432
-rw-r--r--arch/x86/kernel/cpu/topology.c4
-rw-r--r--arch/x86/kernel/cpu/tsx.c127
-rw-r--r--arch/x86/kernel/cpu/vmware.c7
-rw-r--r--arch/x86/kernel/cpu/vortex.c39
-rw-r--r--arch/x86/kernel/crash.c29
-rw-r--r--arch/x86/kernel/crash_dump_32.c29
-rw-r--r--arch/x86/kernel/crash_dump_64.c51
-rw-r--r--arch/x86/kernel/devicetree.c10
-rw-r--r--arch/x86/kernel/doublefault_32.c7
-rw-r--r--arch/x86/kernel/dumpstack.c12
-rw-r--r--arch/x86/kernel/dumpstack_64.c6
-rw-r--r--arch/x86/kernel/e820.c47
-rw-r--r--arch/x86/kernel/early-quirks.c23
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/fpu/bugs.c2
-rw-r--r--arch/x86/kernel/fpu/context.h83
-rw-r--r--arch/x86/kernel/fpu/core.c748
-rw-r--r--arch/x86/kernel/fpu/init.c89
-rw-r--r--arch/x86/kernel/fpu/internal.h28
-rw-r--r--arch/x86/kernel/fpu/legacy.h111
-rw-r--r--arch/x86/kernel/fpu/regset.c236
-rw-r--r--arch/x86/kernel/fpu/signal.c562
-rw-r--r--arch/x86/kernel/fpu/xstate.c1671
-rw-r--r--arch/x86/kernel/fpu/xstate.h326
-rw-r--r--arch/x86/kernel/ftrace.c90
-rw-r--r--arch/x86/kernel/ftrace_32.S6
-rw-r--r--arch/x86/kernel/ftrace_64.S73
-rw-r--r--arch/x86/kernel/head64.c111
-rw-r--r--arch/x86/kernel/head_32.S22
-rw-r--r--arch/x86/kernel/head_64.S104
-rw-r--r--arch/x86/kernel/hpet.c91
-rw-r--r--arch/x86/kernel/i8259.c8
-rw-r--r--arch/x86/kernel/idt.c59
-rw-r--r--arch/x86/kernel/irq.c6
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irqflags.S4
-rw-r--r--arch/x86/kernel/itmt.c2
-rw-r--r--arch/x86/kernel/jump_label.c108
-rw-r--r--arch/x86/kernel/kdebugfs.c37
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes/common.h1
-rw-r--r--arch/x86/kernel/kprobes/core.c699
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c4
-rw-r--r--arch/x86/kernel/kprobes/opt.c40
-rw-r--r--arch/x86/kernel/ksysfs.c77
-rw-r--r--arch/x86/kernel/kvm.c508
-rw-r--r--arch/x86/kernel/kvmclock.c50
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c15
-rw-r--r--arch/x86/kernel/machine_kexec_64.c68
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/module.c35
-rw-r--r--arch/x86/kernel/mpparse.c3
-rw-r--r--arch/x86/kernel/nmi.c25
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c9
-rw-r--r--arch/x86/kernel/paravirt.c161
-rw-r--r--arch/x86/kernel/paravirt_patch.c99
-rw-r--r--arch/x86/kernel/pci-dma.c114
-rw-r--r--arch/x86/kernel/pci-iommu_table.c77
-rw-r--r--arch/x86/kernel/pci-swiotlb.c78
-rw-r--r--arch/x86/kernel/probe_roms.c15
-rw-r--r--arch/x86/kernel/process.c176
-rw-r--r--arch/x86/kernel/process.h4
-rw-r--r--arch/x86/kernel/process_32.c18
-rw-r--r--arch/x86/kernel/process_64.c39
-rw-r--r--arch/x86/kernel/ptrace.c15
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/reboot.c21
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S12
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S24
-rw-r--r--arch/x86/kernel/resource.c17
-rw-r--r--arch/x86/kernel/rethook.c127
-rw-r--r--arch/x86/kernel/setup.c267
-rw-r--r--arch/x86/kernel/setup_percpu.c71
-rw-r--r--arch/x86/kernel/sev-es-shared.c533
-rw-r--r--arch/x86/kernel/sev-es.c1434
-rw-r--r--arch/x86/kernel/sev-shared.c1000
-rw-r--r--arch/x86/kernel/sev.c2251
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S2
-rw-r--r--arch/x86/kernel/signal.c188
-rw-r--r--arch/x86/kernel/signal_compat.c18
-rw-r--r--arch/x86/kernel/smp.c4
-rw-r--r--arch/x86/kernel/smpboot.c654
-rw-r--r--arch/x86/kernel/stacktrace.c8
-rw-r--r--arch/x86/kernel/static_call.c28
-rw-r--r--arch/x86/kernel/step.c3
-rw-r--r--arch/x86/kernel/sys_x86_64.c7
-rw-r--r--arch/x86/kernel/sysfb.c70
-rw-r--r--arch/x86/kernel/sysfb_efi.c284
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c111
-rw-r--r--arch/x86/kernel/tboot.c45
-rw-r--r--arch/x86/kernel/tls.c8
-rw-r--r--arch/x86/kernel/topology.c7
-rw-r--r--arch/x86/kernel/trace.c234
-rw-r--r--arch/x86/kernel/tracepoint.c6
-rw-r--r--arch/x86/kernel/traps.c425
-rw-r--r--arch/x86/kernel/tsc.c47
-rw-r--r--arch/x86/kernel/tsc_sync.c43
-rw-r--r--arch/x86/kernel/umip.c22
-rw-r--r--arch/x86/kernel/unwind_frame.c3
-rw-r--r--arch/x86/kernel/unwind_guess.c3
-rw-r--r--arch/x86/kernel/unwind_orc.c31
-rw-r--r--arch/x86/kernel/uprobes.c8
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vm86_32.c14
-rw-r--r--arch/x86/kernel/vmlinux.lds.S40
-rw-r--r--arch/x86/kernel/x86_init.c28
-rw-r--r--arch/x86/kvm/Kconfig30
-rw-r--r--arch/x86/kvm/Makefile18
-rw-r--r--arch/x86/kvm/cpuid.c577
-rw-r--r--arch/x86/kvm/cpuid.h157
-rw-r--r--arch/x86/kvm/debugfs.c125
-rw-r--r--arch/x86/kvm/emulate.c453
-rw-r--r--arch/x86/kvm/fpu.h140
-rw-r--r--arch/x86/kvm/hyperv.c713
-rw-r--r--arch/x86/kvm/hyperv.h11
-rw-r--r--arch/x86/kvm/i8254.c11
-rw-r--r--arch/x86/kvm/i8259.c34
-rw-r--r--arch/x86/kvm/ioapic.c22
-rw-r--r--arch/x86/kvm/ioapic.h9
-rw-r--r--arch/x86/kvm/irq.c10
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/irq_comm.c21
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h40
-rw-r--r--arch/x86/kvm/kvm_emulate.h24
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c108
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h20
-rw-r--r--arch/x86/kvm/lapic.c452
-rw-r--r--arch/x86/kvm/lapic.h22
-rw-r--r--arch/x86/kvm/mmu.h221
-rw-r--r--arch/x86/kvm/mmu/mmu.c3529
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c303
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h217
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h52
-rw-r--r--arch/x86/kvm/mmu/page_track.c61
-rw-r--r--arch/x86/kvm/mmu/paging.h14
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h427
-rw-r--r--arch/x86/kvm/mmu/spte.c332
-rw-r--r--arch/x86/kvm/mmu/spte.h305
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c29
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h62
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c1615
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h107
-rw-r--r--arch/x86/kvm/pmu.c252
-rw-r--r--arch/x86/kvm/pmu.h31
-rw-r--r--arch/x86/kvm/reverse_cpuid.h186
-rw-r--r--arch/x86/kvm/svm/avic.c516
-rw-r--r--arch/x86/kvm/svm/hyperv.h35
-rw-r--r--arch/x86/kvm/svm/nested.c1181
-rw-r--r--arch/x86/kvm/svm/pmu.c70
-rw-r--r--arch/x86/kvm/svm/sev.c1743
-rw-r--r--arch/x86/kvm/svm/svm.c2596
-rw-r--r--arch/x86/kvm/svm/svm.h414
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c40
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h103
-rw-r--r--arch/x86/kvm/svm/svm_ops.h6
-rw-r--r--arch/x86/kvm/svm/vmenter.S51
-rw-r--r--arch/x86/kvm/trace.h118
-rw-r--r--arch/x86/kvm/vmx/capabilities.h21
-rw-r--r--arch/x86/kvm/vmx/evmcs.c20
-rw-r--r--arch/x86/kvm/vmx/evmcs.h62
-rw-r--r--arch/x86/kvm/vmx/nested.c1211
-rw-r--r--arch/x86/kvm/vmx/nested.h19
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c107
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c299
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h19
-rw-r--r--arch/x86/kvm/vmx/sgx.c496
-rw-r--r--arch/x86/kvm/vmx/sgx.h34
-rw-r--r--arch/x86/kvm/vmx/vmcs.h25
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c7
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h22
-rw-r--r--arch/x86/kvm/vmx/vmenter.S14
-rw-r--r--arch/x86/kvm/vmx/vmx.c2123
-rw-r--r--arch/x86/kvm/vmx/vmx.h187
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h51
-rw-r--r--arch/x86/kvm/x86.c4239
-rw-r--r--arch/x86/kvm/x86.h73
-rw-r--r--arch/x86/kvm/xen.c1374
-rw-r--r--arch/x86/kvm/xen.h78
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/atomic64_386_32.S88
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S18
-rw-r--r--arch/x86/lib/checksum_32.S27
-rw-r--r--arch/x86/lib/clear_page_64.S6
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S4
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S4
-rw-r--r--arch/x86/lib/copy_mc_64.S24
-rw-r--r--arch/x86/lib/copy_page_64.S6
-rw-r--r--arch/x86/lib/copy_user_64.S151
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/csum-partial_64.c165
-rw-r--r--arch/x86/lib/csum-wrappers_64.c2
-rw-r--r--arch/x86/lib/delay.c4
-rw-r--r--arch/x86/lib/error-inject.c5
-rw-r--r--arch/x86/lib/getuser.S22
-rw-r--r--arch/x86/lib/hweight.S6
-rw-r--r--arch/x86/lib/inat.c2
-rw-r--r--arch/x86/lib/insn-eval.c265
-rw-r--r--arch/x86/lib/insn.c235
-rw-r--r--arch/x86/lib/iomap_copy_64.S2
-rw-r--r--arch/x86/lib/iomem.c65
-rw-r--r--arch/x86/lib/kaslr.c20
-rw-r--r--arch/x86/lib/memcpy_32.c5
-rw-r--r--arch/x86/lib/memcpy_64.S24
-rw-r--r--arch/x86/lib/memmove_64.S10
-rw-r--r--arch/x86/lib/memset_64.S14
-rw-r--r--arch/x86/lib/mmx_32.c388
-rw-r--r--arch/x86/lib/msr-reg.S4
-rw-r--r--arch/x86/lib/msr-smp.c4
-rw-r--r--arch/x86/lib/msr.c4
-rw-r--r--arch/x86/lib/pc-conf-reg.c13
-rw-r--r--arch/x86/lib/putuser.S10
-rw-r--r--arch/x86/lib/retpoline.S44
-rw-r--r--arch/x86/lib/string_32.c1
-rw-r--r--arch/x86/lib/usercopy.c2
-rw-r--r--arch/x86/lib/usercopy_32.c67
-rw-r--r--arch/x86/lib/usercopy_64.c10
-rw-r--r--arch/x86/lib/x86-opcode-map.txt111
-rw-r--r--arch/x86/math-emu/div_Xsig.S2
-rw-r--r--arch/x86/math-emu/div_small.S2
-rw-r--r--arch/x86/math-emu/fpu_aux.c2
-rw-r--r--arch/x86/math-emu/fpu_entry.c6
-rw-r--r--arch/x86/math-emu/fpu_proto.h2
-rw-r--r--arch/x86/math-emu/fpu_system.h2
-rw-r--r--arch/x86/math-emu/fpu_trig.c11
-rw-r--r--arch/x86/math-emu/get_address.c2
-rw-r--r--arch/x86/math-emu/load_store.c2
-rw-r--r--arch/x86/math-emu/mul_Xsig.S6
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S2
-rw-r--r--arch/x86/math-emu/reg_ld_str.c4
-rw-r--r--arch/x86/math-emu/reg_norm.S6
-rw-r--r--arch/x86/math-emu/reg_round.S4
-rw-r--r--arch/x86/math-emu/reg_u_add.S2
-rw-r--r--arch/x86/math-emu/reg_u_div.S2
-rw-r--r--arch/x86/math-emu/reg_u_mul.S2
-rw-r--r--arch/x86/math-emu/reg_u_sub.S2
-rw-r--r--arch/x86/math-emu/round_Xsig.S4
-rw-r--r--arch/x86/math-emu/shr_Xsig.S8
-rw-r--r--arch/x86/math-emu/wm_shrx.S16
-rw-r--r--arch/x86/mm/Makefile10
-rw-r--r--arch/x86/mm/amdtopology.c2
-rw-r--r--arch/x86/mm/cpu_entry_area.c7
-rw-r--r--arch/x86/mm/extable.c218
-rw-r--r--arch/x86/mm/fault.c65
-rw-r--r--arch/x86/mm/init.c37
-rw-r--r--arch/x86/mm/init_32.c52
-rw-r--r--arch/x86/mm/init_64.c232
-rw-r--r--arch/x86/mm/ioremap.c103
-rw-r--r--arch/x86/mm/kasan_init_64.c6
-rw-r--r--arch/x86/mm/kaslr.c2
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/maccess.c7
-rw-r--r--arch/x86/mm/mem_encrypt.c436
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c530
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S6
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c81
-rw-r--r--arch/x86/mm/mmio-mod.c6
-rw-r--r--arch/x86/mm/numa.c48
-rw-r--r--arch/x86/mm/numa_emulation.c12
-rw-r--r--arch/x86/mm/pat/memtype.c13
-rw-r--r--arch/x86/mm/pat/set_memory.c125
-rw-r--r--arch/x86/mm/pgprot.c35
-rw-r--r--arch/x86/mm/pgtable.c28
-rw-r--r--arch/x86/mm/pkeys.c26
-rw-r--r--arch/x86/mm/pti.c13
-rw-r--r--arch/x86/mm/setup_nx.c62
-rw-r--r--arch/x86/mm/tlb.c275
-rw-r--r--arch/x86/net/bpf_jit_comp.c765
-rw-r--r--arch/x86/net/bpf_jit_comp32.c241
-rw-r--r--arch/x86/pci/acpi.c89
-rw-r--r--arch/x86/pci/amd_bus.c4
-rw-r--r--arch/x86/pci/common.c10
-rw-r--r--arch/x86/pci/fixup.c50
-rw-r--r--arch/x86/pci/irq.c650
-rw-r--r--arch/x86/pci/mmconfig-shared.c10
-rw-r--r--arch/x86/pci/numachip.c1
-rw-r--r--arch/x86/pci/sta2x11-fixup.c5
-rw-r--r--arch/x86/pci/xen.c129
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts4
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c5
-rw-r--r--arch/x86/platform/efi/efi_64.c15
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S2
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S2
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S24
-rw-r--r--arch/x86/platform/efi/quirks.c19
-rw-r--r--arch/x86/platform/intel-quark/imr.c4
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c2
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c4
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c2
-rw-r--r--arch/x86/platform/olpc/olpc.c2
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c2
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S6
-rw-r--r--arch/x86/platform/pvh/enlighten.c12
-rw-r--r--arch/x86/platform/pvh/head.S21
-rw-r--r--arch/x86/platform/uv/uv_nmi.c71
-rw-r--r--arch/x86/power/cpu.c72
-rw-r--r--arch/x86/power/hibernate.c89
-rw-r--r--arch/x86/power/hibernate_asm_32.S4
-rw-r--r--arch/x86/power/hibernate_asm_64.S4
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/realmode/Makefile1
-rw-r--r--arch/x86/realmode/init.c68
-rw-r--r--arch/x86/realmode/rm/header.S1
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S61
-rw-r--r--arch/x86/realmode/rm/trampoline_common.S12
-rw-r--r--arch/x86/realmode/rm/wakemain.c4
-rw-r--r--arch/x86/tools/chkobjdump.awk1
-rw-r--r--arch/x86/tools/insn_decoder_test.c10
-rw-r--r--arch/x86/tools/insn_sanity.c8
-rw-r--r--arch/x86/tools/relocs.c142
-rw-r--r--arch/x86/tools/relocs.h1
-rw-r--r--arch/x86/um/Kconfig3
-rw-r--r--arch/x86/um/Makefile3
-rw-r--r--arch/x86/um/asm/barrier.h1
-rw-r--r--arch/x86/um/asm/elf.h2
-rw-r--r--arch/x86/um/asm/segment.h8
-rw-r--r--arch/x86/um/checksum_32.S4
-rw-r--r--arch/x86/um/ldt.c6
-rw-r--r--arch/x86/um/os-Linux/registers.c1
-rw-r--r--arch/x86/um/ptrace_32.c1
-rw-r--r--arch/x86/um/ptrace_64.c1
-rw-r--r--arch/x86/um/setjmp_32.S2
-rw-r--r--arch/x86/um/setjmp_64.S2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h14
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h12
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h8
-rw-r--r--arch/x86/um/signal.c1
-rw-r--r--arch/x86/um/stub_segv.c3
-rw-r--r--arch/x86/um/sys_call_table_32.c16
-rw-r--r--arch/x86/um/sys_call_table_64.c32
-rw-r--r--arch/x86/um/syscalls_64.c13
-rw-r--r--arch/x86/um/user-offsets.c9
-rw-r--r--arch/x86/virt/vmx/tdx/tdxcall.S96
-rw-r--r--arch/x86/xen/Kconfig20
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c169
-rw-r--r--arch/x86/xen/enlighten_hvm.c30
-rw-r--r--arch/x86/xen/enlighten_pv.c205
-rw-r--r--arch/x86/xen/enlighten_pvh.c10
-rw-r--r--arch/x86/xen/irq.c73
-rw-r--r--arch/x86/xen/mmu_hvm.c37
-rw-r--r--arch/x86/xen/mmu_pv.c172
-rw-r--r--arch/x86/xen/p2m.c6
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c96
-rw-r--r--arch/x86/xen/platform-pci-unplug.c16
-rw-r--r--arch/x86/xen/pmu.c42
-rw-r--r--arch/x86/xen/pmu.h3
-rw-r--r--arch/x86/xen/setup.c18
-rw-r--r--arch/x86/xen/smp.c28
-rw-r--r--arch/x86/xen/smp_hvm.c6
-rw-r--r--arch/x86/xen/smp_pv.c52
-rw-r--r--arch/x86/xen/time.c50
-rw-r--r--arch/x86/xen/vga.c12
-rw-r--r--arch/x86/xen/xen-asm.S117
-rw-r--r--arch/x86/xen/xen-head.S52
-rw-r--r--arch/x86/xen/xen-ops.h9
812 files changed, 55993 insertions, 30252 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 30dec019756b..5a83da703e87 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,4 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/
+
obj-y += entry/
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -25,3 +27,6 @@ obj-y += platform/
obj-y += net/
obj-$(CONFIG_KEXEC_FILE) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..be0b95e51df6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -33,6 +33,7 @@ config X86_64
select NEED_DMA_MAP_STATE
select SWIOTLB
select ARCH_HAS_ELFCORE_COMPAT
+ select ZONE_DMA32
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -40,11 +41,11 @@ config FORCE_DYNAMIC_FTRACE
depends on FUNCTION_TRACER
select DYNAMIC_FTRACE
help
- We keep the static function tracing (!DYNAMIC_FTRACE) around
- in order to test the non static function tracing in the
- generic code, as other architectures still use it. But we
- only need to keep it around for x86_64. No need to keep it
- for x86_32. For x86_32, force DYNAMIC_FTRACE.
+ We keep the static function tracing (!DYNAMIC_FTRACE) around
+ in order to test the non static function tracing in the
+ generic code, as other architectures still use it. But we
+ only need to keep it around for x86_64. No need to keep it
+ for x86_32. For x86_32, force DYNAMIC_FTRACE.
#
# Arch settings
#
@@ -60,17 +61,24 @@ config X86
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
+ select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+ select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
+ select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
+ select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
+ select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
- select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION
+ select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
@@ -86,7 +94,9 @@ config X86
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_VM_GET_PAGE_PROT
select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -95,25 +105,32 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
- select ARCH_SUPPORTS_LTO_CLANG if X86_64
- select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64
+ select ARCH_SUPPORTS_LTO_CLANG
+ select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_USE_SYM_ANNOTATIONS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_NO_INSTR
+ select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
+ select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select CLOCKSOURCE_WATCHDOG
select DCACHE_WORD_ACCESS
+ select DYNAMIC_SIGFRAME
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -123,7 +140,6 @@ config X86
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
- select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
@@ -134,8 +150,6 @@ config X86
select GENERIC_PENDING_IRQ if SMP
select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
- select GENERIC_STRNCPY_FROM_USER
- select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
@@ -147,6 +161,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_HUGE_VMALLOC if X86_64
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
@@ -164,7 +179,9 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
+ select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_ASM_MODVERSIONS
select HAVE_CMPXCHG_DOUBLE
@@ -172,13 +189,16 @@ config X86
select HAVE_CONTEXT_TRACKING if X86_64
select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
- select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION
+ select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
+ select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ select HAVE_SAMPLE_FTRACE_DIRECT if X86_64
+ select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA
@@ -186,14 +206,14 @@ config X86
select HAVE_FAST_GUP
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT
- select HAVE_IDE
select HAVE_IOREMAP_PROT
select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZ4
@@ -205,13 +225,17 @@ config X86
select HAVE_KPROBES_ON_FTRACE
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
+ select HAVE_RETHOOK
select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
+ select HAVE_NOINSTR_HACK if HAVE_OBJTOOL
select HAVE_NMI
+ select HAVE_NOINSTR_VALIDATION if HAVE_OBJTOOL
+ select HAVE_OBJTOOL if X86_64
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
@@ -220,24 +244,28 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_SETUP_PER_CPU_AREA
select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
- select HAVE_STACK_VALIDATION if X86_64
+ select HAVE_STACK_VALIDATION if HAVE_OBJTOOL
select HAVE_STATIC_CALL
- select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
- select HAVE_PREEMPT_DYNAMIC
+ select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
+ select HAVE_PREEMPT_DYNAMIC_CALL
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
select NEED_SG_DMA_LENGTH
select PCI_DOMAINS if PCI
select PCI_LOCKLESS_CONFIG if PCI
@@ -246,14 +274,15 @@ config X86
select RTC_MC146818_LIB
select SPARSE_IRQ
select SRCU
- select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
@@ -313,33 +342,17 @@ config GENERIC_CALIBRATE_DELAY
config ARCH_HAS_CPU_RELAX
def_bool y
-config ARCH_HAS_CACHE_LINE_SIZE
- def_bool y
-
-config ARCH_HAS_FILTER_PGPROT
- def_bool y
-
-config HAVE_SETUP_PER_CPU_AREA
- def_bool y
-
-config NEED_PER_CPU_EMBED_FIRST_CHUNK
- def_bool y
-
-config NEED_PER_CPU_PAGE_FIRST_CHUNK
- def_bool y
-
config ARCH_HIBERNATION_POSSIBLE
def_bool y
-config ARCH_SUSPEND_POSSIBLE
- def_bool y
+config ARCH_NR_GPIO
+ int
+ default 1024 if X86_64
+ default 512
-config ARCH_WANT_GENERAL_HUGETLB
+config ARCH_SUSPEND_POSSIBLE
def_bool y
-config ZONE_DMA32
- def_bool y if X86_64
-
config AUDIT_ARCH
def_bool y if X86_64
@@ -360,10 +373,6 @@ config X86_64_SMP
def_bool y
depends on X86_64 && SMP
-config X86_32_LAZY_GS
- def_bool y
- depends on X86_32 && !STACKPROTECTOR
-
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -385,21 +394,12 @@ config CC_HAS_SANE_STACKPROTECTOR
default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT
default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC))
help
- We have to make sure stack protector is unconditionally disabled if
- the compiler produces broken code.
+ We have to make sure stack protector is unconditionally disabled if
+ the compiler produces broken code or if it does not let us control
+ the segment on 32-bit kernels.
menu "Processor type and features"
-config ZONE_DMA
- bool "DMA memory allocation support" if EXPERT
- default y
- help
- DMA memory allocation support allows devices with less than 32-bit
- addressing to allocate within the first 16MB of address space.
- Disable if no such devices will be used.
-
- If unsure, say Y.
-
config SMP
bool "Symmetric multi-processing support"
help
@@ -464,6 +464,7 @@ config GOLDFISH
config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
+ select OBJTOOL if HAVE_OBJTOOL
default y
help
Compile kernel with the retpoline compiler options to guard against
@@ -471,6 +472,19 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ select OBJTOOL if HAVE_OBJTOOL
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
@@ -518,7 +532,7 @@ config X86_EXTENDED_PLATFORM
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif
+endif # X86_32
if X86_64
config X86_EXTENDED_PLATFORM
@@ -537,7 +551,7 @@ config X86_EXTENDED_PLATFORM
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif
+endif # X86_64
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
config X86_NUMACHIP
@@ -571,6 +585,7 @@ config X86_UV
depends on X86_EXTENDED_PLATFORM
depends on NUMA
depends on EFI
+ depends on KEXEC_CORE
depends on X86_X2APIC
depends on PCI
help
@@ -584,9 +599,9 @@ config X86_GOLDFISH
bool "Goldfish (Virtual Platform)"
depends on X86_EXTENDED_PLATFORM
help
- Enable support for the Goldfish virtual platform used primarily
- for Android development. Unless you are building for the Android
- Goldfish emulator say N here.
+ Enable support for the Goldfish virtual platform used primarily
+ for Android development. Unless you are building for the Android
+ Goldfish emulator say N here.
config X86_INTEL_CE
bool "CE4100 TV platform"
@@ -612,9 +627,7 @@ config X86_INTEL_MID
depends on X86_IO_APIC
select I2C
select DW_APB_TIMER
- select APB_TIMER
select INTEL_SCU_PCI
- select MFD_INTEL_MSIC
help
Select to build a kernel capable of supporting Intel MID (Mobile
Internet Device) platform systems which do not have the PCI legacy
@@ -777,6 +790,7 @@ if HYPERVISOR_GUEST
config PARAVIRT
bool "Enable paravirtualization code"
+ depends on HAVE_STATIC_CALL
help
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
@@ -871,7 +885,22 @@ config ACRN_GUEST
IOT with small footprint and real-time features. More details can be
found in https://projectacrn.org/.
-endif #HYPERVISOR_GUEST
+config INTEL_TDX_GUEST
+ bool "Intel TDX (Trust Domain Extensions) - Guest Support"
+ depends on X86_64 && CPU_SUP_INTEL
+ depends on X86_X2APIC
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
+ select X86_MCE
+ help
+ Support running as a guest under Intel TDX. Without this support,
+ the guest kernel can not boot or run under TDX.
+ TDX includes memory encryption and integrity capabilities
+ which protect the confidentiality and integrity of guest
+ memory contents and CPU state. TDX guests are protected from
+ some attacks from the VMM.
+
+endif # HYPERVISOR_GUEST
source "arch/x86/Kconfig.cpu"
@@ -933,6 +962,12 @@ config GART_IOMMU
If unsure, say Y.
+config BOOT_VESA_SUPPORT
+ bool
+ help
+ If true, at least one selected framebuffer driver can take advantage
+ of VESA video modes set at an early boot stage via the vga= parameter.
+
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
@@ -1002,6 +1037,17 @@ config NR_CPUS
This is purely to save memory: each supported CPU adds about 8KB
to the kernel image.
+config SCHED_CLUSTER
+ bool "Cluster scheduler support"
+ depends on SMP
+ default y
+ help
+ Cluster scheduler support improves the CPU scheduler's decision
+ making when dealing with machines that have clusters of CPUs.
+ Cluster usually means a couple of CPUs which are placed closely
+ by sharing mid-level caches, last-level cache tags or internal
+ busses.
+
config SCHED_SMT
def_bool y if SMP
@@ -1121,16 +1167,16 @@ config X86_MCE_INTEL
prompt "Intel MCE features"
depends on X86_MCE && X86_LOCAL_APIC
help
- Additional support for intel specific MCE features such as
- the thermal monitor.
+ Additional support for intel specific MCE features such as
+ the thermal monitor.
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
depends on X86_MCE && X86_LOCAL_APIC && AMD_NB
help
- Additional support for AMD specific MCE features such as
- the DRAM Error Threshold.
+ Additional support for AMD specific MCE features such as
+ the DRAM Error Threshold.
config X86_ANCIENT_MCE
bool "Support for old Pentium 5 / WinChip machine checks"
@@ -1208,18 +1254,18 @@ config X86_VSYSCALL_EMULATION
default y
depends on X86_64
help
- This enables emulation of the legacy vsyscall page. Disabling
- it is roughly equivalent to booting with vsyscall=none, except
- that it will also disable the helpful warning if a program
- tries to use a vsyscall. With this option set to N, offending
- programs will just segfault, citing addresses of the form
- 0xffffffffff600?00.
+ This enables emulation of the legacy vsyscall page. Disabling
+ it is roughly equivalent to booting with vsyscall=none, except
+ that it will also disable the helpful warning if a program
+ tries to use a vsyscall. With this option set to N, offending
+ programs will just segfault, citing addresses of the form
+ 0xffffffffff600?00.
- This option is required by many programs built before 2013, and
- care should be used even with newer programs if set to N.
+ This option is required by many programs built before 2013, and
+ care should be used even with newer programs if set to N.
- Disabling this option saves about 7K of kernel size and
- possibly 4K of additional runtime pagetable memory.
+ Disabling this option saves about 7K of kernel size and
+ possibly 4K of additional runtime pagetable memory.
config X86_IOPL_IOPERM
bool "IOPERM and IOPL Emulation"
@@ -1255,22 +1301,6 @@ config TOSHIBA
Say Y if you intend to run this kernel on a Toshiba portable.
Say N otherwise.
-config I8K
- tristate "Dell i8k legacy laptop support"
- select HWMON
- select SENSORS_DELL_SMM
- help
- This option enables legacy /proc/i8k userspace interface in hwmon
- dell-smm-hwmon driver. Character file /proc/i8k reports bios version,
- temperature and allows controlling fan speeds of Dell laptops via
- System Management Mode. For old Dell laptops (like Dell Inspiron 8000)
- it reports also power and hotkey status. For fan speed control is
- needed userspace package i8kutils.
-
- Say Y if you intend to run this kernel on old Dell laptops or want to
- use userspace package i8kutils.
- Say N otherwise.
-
config X86_REBOOTFIXUPS
bool "Enable X86 board specific fixups for reboot"
depends on X86_32
@@ -1311,7 +1341,7 @@ config MICROCODE
config MICROCODE_INTEL
bool "Intel microcode loading support"
- depends on MICROCODE
+ depends on CPU_SUP_INTEL && MICROCODE
default MICROCODE
help
This options enables microcode patch loading support for Intel
@@ -1323,22 +1353,21 @@ config MICROCODE_INTEL
config MICROCODE_AMD
bool "AMD microcode loading support"
- depends on MICROCODE
+ depends on CPU_SUP_AMD && MICROCODE
help
If you select this option, microcode patch loading support for AMD
processors will be enabled.
-config MICROCODE_OLD_INTERFACE
- bool "Ancient loading interface (DEPRECATED)"
+config MICROCODE_LATE_LOADING
+ bool "Late microcode loading (DANGEROUS)"
default n
depends on MICROCODE
help
- DO NOT USE THIS! This is the ancient /dev/cpu/microcode interface
- which was used by userspace tools like iucode_tool and microcode.ctl.
- It is inadequate because it runs too late to be able to properly
- load microcode on a machine and it needs special tools. Instead, you
- should've switched to the early loading method with the initrd or
- builtin microcode by now: Documentation/x86/microcode.rst
+ Loading microcode late, when the system is up and executing instructions
+ is a tricky business and should be avoided if possible. Just the sequence
+ of synchronizing all cores and SMT threads is one fragile dance which does
+ not guarantee that cores might not softlock after the loading. Therefore,
+ use this at your own risk. Late loading taints the kernel too.
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -1406,7 +1435,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
- depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
+ depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
@@ -1510,14 +1539,19 @@ config X86_CPA_STATISTICS
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1525,7 +1559,6 @@ config AMD_MEM_ENCRYPT
config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
bool "Activate AMD Secure Memory Encryption (SME) by default"
- default y
depends on AMD_MEM_ENCRYPT
help
Say yes to have system memory encrypted by default if running on
@@ -1543,6 +1576,7 @@ config NUMA
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
default y if X86_BIGSMP
+ select USE_PERCPU_NUMA_NODE_ID
help
Enable NUMA (Non-Uniform Memory Access) support.
@@ -1591,7 +1625,7 @@ config NODES_SHIFT
default "10" if MAXSMP
default "6" if X86_64
default "3"
- depends on NEED_MULTIPLE_NODES
+ depends on NUMA
help
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
@@ -1611,11 +1645,11 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SELECT_MEMORY_MODEL
def_bool y
- depends on ARCH_SPARSEMEM_ENABLE
+ depends on ARCH_SPARSEMEM_ENABLE && ARCH_FLATMEM_ENABLE
config ARCH_MEMORY_PROBE
bool "Enable sysfs memory/probe interface"
- depends on X86_64 && MEMORY_HOTPLUG
+ depends on MEMORY_HOTPLUG
help
This option enables a sysfs memory/probe interface for testing.
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
@@ -1687,35 +1721,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW
- int "Amount of low memory, in kilobytes, to reserve for the BIOS"
- default 64
- range 4 640
- help
- Specify the amount of low memory to reserve for the BIOS.
-
- The first page contains BIOS data structures that the kernel
- must not use, so that page must always be reserved.
-
- By default we reserve the first 64K of physical RAM, as a
- number of BIOSes are known to corrupt that memory range
- during events such as suspend/resume or monitor cable
- insertion, so it must not be used by the kernel.
-
- You can set this to 4 if you are absolutely sure that you
- trust the BIOS to get all its memory reservations and usages
- right. If you know your BIOS have problems beyond the
- default 64K area, you can set this to 640 to avoid using the
- entire low memory range.
-
- If you have doubts about the BIOS (e.g. suspend/resume does
- not work or there's kernel crashes after certain hardware
- hotplug events) then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
- typical corruption patterns.
-
- Leave this to the default value of 64 if you are unsure.
-
config MATH_EMULATION
bool
depends on MODIFY_LDT_SYSCALL
@@ -1837,17 +1842,6 @@ config ARCH_RANDOM
If supported, this is a high bandwidth, cryptographically
secure hardware random number generator.
-config X86_SMAP
- def_bool y
- prompt "Supervisor Mode Access Prevention" if EXPERT
- help
- Supervisor Mode Access Prevention (SMAP) is a security
- feature in newer Intel processors. There is a small
- performance cost if this enabled and turned on; there is
- also a small increase in the kernel size if this is enabled.
-
- If unsure, say Y.
-
config X86_UMIP
def_bool y
prompt "User Mode Instruction Prevention" if EXPERT
@@ -1863,6 +1857,37 @@ config X86_UMIP
specific cases in protected and virtual-8086 modes. Emulated
results are dummy.
+config CC_HAS_IBT
+ # GCC >= 9 and binutils >= 2.29
+ # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654
+ # Clang/LLVM >= 14
+ # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f
+ # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332
+ def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \
+ (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \
+ $(as-instr,endbr64)
+
+config X86_KERNEL_IBT
+ prompt "Indirect Branch Tracking"
+ bool
+ depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
+ # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
+ depends on !LD_IS_LLD || LLD_VERSION >= 140000
+ select OBJTOOL
+ help
+ Build the kernel with support for Indirect Branch Tracking, a
+ hardware support course-grain forward-edge Control Flow Integrity
+ protection. It enforces that all indirect calls must land on
+ an ENDBR instruction, as such, the compiler will instrument the
+ code with them to make this happen.
+
+ In addition to building the kernel with IBT, seal all functions that
+ are not indirect call targets, avoiding them ever becoming one.
+
+ This requires LTO like objtool runs and will slow down the build. It
+ does significantly reduce the number of ENDBR instructions in the
+ kernel image.
+
config X86_INTEL_MEMORY_PROTECTION_KEYS
prompt "Memory Protection Keys"
def_bool y
@@ -1931,6 +1956,8 @@ config X86_SGX
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
@@ -1946,6 +1973,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1959,7 +1987,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI && !X86_USE_3DNOW
+ depends on EFI
depends on $(cc-option,-mabi=ms) || X86_32
select RELOCATABLE
help
@@ -1972,15 +2000,15 @@ config EFI_MIXED
bool "EFI mixed-mode support"
depends on EFI_STUB && X86_64
help
- Enabling this feature allows a 64-bit kernel to be booted
- on a 32-bit firmware, provided that your CPU supports 64-bit
- mode.
+ Enabling this feature allows a 64-bit kernel to be booted
+ on a 32-bit firmware, provided that your CPU supports 64-bit
+ mode.
- Note that it is not possible to boot a mixed-mode enabled
- kernel via the EFI boot stub - a bootloader that supports
- the EFI handover protocol must be used.
+ Note that it is not possible to boot a mixed-mode enabled
+ kernel via the EFI boot stub - a bootloader that supports
+ the EFI handover protocol must be used.
- If unsure, say N.
+ If unsure, say N.
source "kernel/Kconfig.hz"
@@ -2205,16 +2233,16 @@ config RANDOMIZE_MEMORY
select DYNAMIC_MEMORY_LAYOUT
default RANDOMIZE_BASE
help
- Randomizes the base virtual address of kernel memory sections
- (physical memory mapping, vmalloc & vmemmap). This security feature
- makes exploits relying on predictable memory locations less reliable.
+ Randomizes the base virtual address of kernel memory sections
+ (physical memory mapping, vmalloc & vmemmap). This security feature
+ makes exploits relying on predictable memory locations less reliable.
- The order of allocations remains unchanged. Entropy is generated in
- the same way as RANDOMIZE_BASE. Current implementation in the optimal
- configuration have in average 30,000 different possible virtual
- addresses for each memory section.
+ The order of allocations remains unchanged. Entropy is generated in
+ the same way as RANDOMIZE_BASE. Current implementation in the optimal
+ configuration have in average 30,000 different possible virtual
+ addresses for each memory section.
- If unsure, say Y.
+ If unsure, say Y.
config RANDOMIZE_MEMORY_PHYSICAL_PADDING
hex "Physical memory mapping padding" if EXPERT
@@ -2224,12 +2252,12 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
range 0x1 0x40 if MEMORY_HOTPLUG
range 0x0 0x40
help
- Define the padding in terabytes added to the existing physical
- memory size during kernel memory randomization. It is useful
- for memory hotplug support but reduces the entropy available for
- address randomization.
+ Define the padding in terabytes added to the existing physical
+ memory size during kernel memory randomization. It is useful
+ for memory hotplug support but reduces the entropy available for
+ address randomization.
- If unsure, leave at the default value.
+ If unsure, leave at the default value.
config HOTPLUG_CPU
def_bool y
@@ -2314,7 +2342,9 @@ choice
it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command
- line parameter vsyscall=[emulate|xonly|none].
+ line parameter vsyscall=[emulate|xonly|none]. Emulate mode
+ is deprecated and can only be enabled using the kernel command
+ line.
On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty
@@ -2322,20 +2352,6 @@ choice
If unsure, select "Emulate execution only".
- config LEGACY_VSYSCALL_EMULATE
- bool "Full emulation"
- help
- The kernel traps and emulates calls into the fixed vsyscall
- address mapping. This makes the mapping non-executable, but
- it still contains readable known contents, which could be
- used in certain rare security vulnerability exploits. This
- configuration is recommended when using legacy userspace
- that still uses vsyscalls along with legacy binary
- instrumentation tools that require code to be readable.
-
- An example of this type of legacy userspace is running
- Pin on an old binary that still uses vsyscalls.
-
config LEGACY_VSYSCALL_XONLY
bool "Emulate execution only"
help
@@ -2417,37 +2433,32 @@ config MODIFY_LDT_SYSCALL
Saying 'N' here may make sense for embedded or server kernels.
+config STRICT_SIGALTSTACK_SIZE
+ bool "Enforce strict size checking for sigaltstack"
+ depends on DYNAMIC_SIGFRAME
+ help
+ For historical reasons MINSIGSTKSZ is a constant which became
+ already too small with AVX512 support. Add a mechanism to
+ enforce strict checking of the sigaltstack size against the
+ real size of the FPU frame. This option enables the check
+ by default. It can also be controlled via the kernel command
+ line option 'strict_sas_size' independent of this config
+ switch. Enabling it might break existing applications which
+ allocate a too small sigaltstack but 'work' because they
+ never get a signal delivered.
+
+ Say 'N' unless you want to really enforce this check.
+
source "kernel/livepatch/Kconfig"
endmenu
config ARCH_HAS_ADD_PAGES
def_bool y
- depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
-
-config ARCH_ENABLE_MEMORY_HOTPLUG
- def_bool y
- depends on X86_64 || (X86_32 && HIGHMEM)
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
- def_bool y
- depends on MEMORY_HOTPLUG
-
-config USE_PERCPU_NUMA_NODE_ID
- def_bool y
- depends on NUMA
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
def_bool y
- depends on X86_64 || X86_PAE
-
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
- def_bool y
- depends on X86_64 && HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
- def_bool y
- depends on X86_64 && TRANSPARENT_HUGEPAGE
menu "Power management and ACPI options"
@@ -2593,7 +2604,6 @@ source "drivers/idle/Kconfig"
endmenu
-
menu "Bus options (PCI etc.)"
choice
@@ -2655,7 +2665,6 @@ config PCI_OLPC
config PCI_XEN
def_bool y
depends on PCI && XEN
- select SWIOTLB_XEN
config MMCONF_FAM10H
def_bool y
@@ -2816,35 +2825,8 @@ config AMD_NB
def_bool y
depends on CPU_SUP_AMD && PCI
-config X86_SYSFB
- bool "Mark VGA/VBE/EFI FB as generic system framebuffer"
- help
- Firmwares often provide initial graphics framebuffers so the BIOS,
- bootloader or kernel can show basic video-output during boot for
- user-guidance and debugging. Historically, x86 used the VESA BIOS
- Extensions and EFI-framebuffers for this, which are mostly limited
- to x86.
- This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
- framebuffers so the new generic system-framebuffer drivers can be
- used on x86. If the framebuffer is not compatible with the generic
- modes, it is advertised as fallback platform framebuffer so legacy
- drivers like efifb, vesafb and uvesafb can pick it up.
- If this option is not selected, all system framebuffers are always
- marked as fallback platform framebuffers as usual.
-
- Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will
- not be able to pick up generic system framebuffers if this option
- is selected. You are highly encouraged to enable simplefb as
- replacement if you select this option. simplefb can correctly deal
- with generic system framebuffers. But you should still keep vesafb
- and others enabled as fallback if a system framebuffer is
- incompatible with simplefb.
-
- If unsure, say Y.
-
endmenu
-
menu "Binary Emulations"
config IA32_EMULATION
@@ -2858,26 +2840,20 @@ config IA32_EMULATION
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
-config IA32_AOUT
- tristate "IA32 a.out support"
- depends on IA32_EMULATION
- depends on BROKEN
- help
- Support old a.out binaries in the 32bit emulation.
-
-config X86_X32
+config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64
+ # llvm-objcopy does not convert x86_64 .note.gnu.property or
+ # compressed debug sections to x86_x32 properly:
+ # https://github.com/ClangBuiltLinux/linux/issues/514
+ # https://github.com/ClangBuiltLinux/linux/issues/1141
+ depends on $(success,$(OBJCOPY) --version | head -n1 | grep -qv llvm)
help
Include code to run binaries for the x32 native 32-bit ABI
for 64-bit processors. An x32 process gets access to the
full 64-bit register file and wide data path while leaving
pointers at 32 bits for smaller memory footprint.
- You will need a recent binutils (2.22 or later) with
- elf32_x86_64 support enabled to compile a kernel with this
- option set.
-
config COMPAT_32
def_bool y
depends on IA32_EMULATION || X86_32
@@ -2886,26 +2862,18 @@ config COMPAT_32
config COMPAT
def_bool y
- depends on IA32_EMULATION || X86_X32
+ depends on IA32_EMULATION || X86_X32_ABI
-if COMPAT
config COMPAT_FOR_U64_ALIGNMENT
def_bool y
-
-config SYSVIPC_COMPAT
- def_bool y
- depends on SYSVIPC
-endif
+ depends on COMPAT
endmenu
-
config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
-source "drivers/firmware/Kconfig"
-
source "arch/x86/kvm/Kconfig"
source "arch/x86/Kconfig.assembler"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 814fe0d349b0..542377cd419d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -342,10 +342,6 @@ config X86_USE_PPRO_CHECKSUM
def_bool y
depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
-config X86_USE_3DNOW
- def_bool y
- depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-
#
# P6_NOPs are a relatively minor optimization that require a family >=
# 6 processor, except that it is broken on certain VIA chips.
@@ -508,3 +504,16 @@ config CPU_SUP_ZHAOXIN
CPU might render the kernel unbootable.
If unsure, say N.
+
+config CPU_SUP_VORTEX_32
+ default y
+ bool "Support Vortex processors" if PROCESSOR_SELECT
+ depends on X86_32
+ help
+ This enables detection, tunings and quirks for Vortex processors
+
+ You need this enabled if you want your kernel to run on a
+ Vortex CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller.
+
+ If unsure, say N.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 80b57e7f4947..340399f69954 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config TRACE_IRQFLAGS_NMI_SUPPORT
def_bool y
@@ -76,20 +73,19 @@ config DEBUG_TLBFLUSH
bool "Set upper limit of TLB entries to flush one-by-one"
depends on DEBUG_KERNEL
help
+ X86-only for now.
- X86-only for now.
-
- This option allows the user to tune the amount of TLB entries the
- kernel flushes one-by-one instead of doing a full TLB flush. In
- certain situations, the former is cheaper. This is controlled by the
- tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
- to -1, the code flushes the whole TLB unconditionally. Otherwise,
- for positive values of it, the kernel will use single TLB entry
- invalidating instructions according to the following formula:
+ This option allows the user to tune the amount of TLB entries the
+ kernel flushes one-by-one instead of doing a full TLB flush. In
+ certain situations, the former is cheaper. This is controlled by the
+ tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
+ to -1, the code flushes the whole TLB unconditionally. Otherwise,
+ for positive values of it, the kernel will use single TLB entry
+ invalidating instructions according to the following formula:
- flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
+ flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
- If in doubt, say "N".
+ If in doubt, say "N".
config IOMMU_DEBUG
bool "Enable IOMMU debugging"
@@ -122,10 +118,10 @@ config X86_DECODER_SELFTEST
depends on DEBUG_KERNEL && INSTRUCTION_DECODER
depends on !COMPILE_TEST
help
- Perform x86 instruction decoder selftests at build time.
- This option is useful for checking the sanity of x86 instruction
- decoder code.
- If unsure, say "N".
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
choice
prompt "IO delay type"
@@ -240,7 +236,7 @@ choice
config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
- select STACK_VALIDATION
+ select OBJTOOL
help
This option enables the ORC (Oops Rewind Capability) unwinder for
unwinding kernel stack traces. It uses a custom data format which is
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9a85eae37b17..a74886aed349 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -12,6 +12,18 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+ifdef CONFIG_CC_IS_GCC
+RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
+RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
+endif
+ifdef CONFIG_CC_IS_CLANG
+RETPOLINE_CFLAGS := -mretpoline-external-thunk
+RETPOLINE_VDSO_CFLAGS := -mretpoline
+endif
+export RETPOLINE_CFLAGS
+export RETPOLINE_VDSO_CFLAGS
+
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
@@ -24,15 +36,16 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \
+REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
REALMODE_CFLAGS += -ffreestanding
REALMODE_CFLAGS += -fno-stack-protector
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
+REALMODE_CFLAGS += -Wno-address-of-packed-member
+REALMODE_CFLAGS += $(cc_stack_align4)
+REALMODE_CFLAGS += $(CLANG_FLAGS)
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -47,20 +60,30 @@ export BITS
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
-# Intel CET isn't enabled in the kernel
+ifeq ($(CONFIG_X86_KERNEL_IBT),y)
+#
+# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
+# NOTRACK prefixes. Current generation compilers unconditionally employ NOTRACK
+# for jump-tables, as such, disable jump-tables for now.
+#
+# (jump-tables are implicitly disabled by RETPOLINE)
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
+#
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
+else
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+endif
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
CHECKFLAGS += -D__i386__
- biarch := $(call cc-option,-m32)
- KBUILD_AFLAGS += $(biarch)
- KBUILD_CFLAGS += $(biarch)
+ KBUILD_AFLAGS += -m32
+ KBUILD_CFLAGS += -m32
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
@@ -71,20 +94,27 @@ ifeq ($(CONFIG_X86_32),y)
# Align the stack to the register width instead of using the default
# alignment of 16 bytes. This reduces stack usage and the number of
# alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
+ KBUILD_CFLAGS += $(cc_stack_align4)
# CPU-specific tuning. Anything which can be shared with UML should go here.
- include arch/x86/Makefile_32.cpu
+ include $(srctree)/arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
# temporary until string.h is fixed
KBUILD_CFLAGS += -ffreestanding
+
+ ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
+ KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
+ else
+ KBUILD_CFLAGS += -mstack-protector-guard=global
+ endif
+ endif
else
BITS := 64
UTS_MACHINE := x86_64
CHECKFLAGS += -D__x86_64__
- biarch := -m64
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
@@ -95,7 +125,7 @@ else
KBUILD_CFLAGS += $(call cc-option,-falign-loops=1)
# Don't autogenerate traditional x87 instructions
- KBUILD_CFLAGS += $(call cc-option,-mno-80387)
+ KBUILD_CFLAGS += -mno-80387
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
# By default gcc and clang use a stack alignment of 16 bytes for x86.
@@ -105,42 +135,23 @@ else
# default alignment which keep the stack *mis*aligned.
# Furthermore an alignment to the register width reduces stack usage
# and the number of alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
+ KBUILD_CFLAGS += $(cc_stack_align8)
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
- cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
- cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
-
- cflags-$(CONFIG_MCORE2) += \
- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
- cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
+ cflags-$(CONFIG_MK8) += -march=k8
+ cflags-$(CONFIG_MPSC) += -march=nocona
+ cflags-$(CONFIG_MCORE2) += -march=core2
+ cflags-$(CONFIG_MATOM) += -march=atom
+ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
endif
-ifdef CONFIG_X86_X32
- x32_ld_ok := $(call try-run,\
- /bin/echo -e '1: .quad 1b' | \
- $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
- $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
- $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
- ifeq ($(x32_ld_ok),y)
- CONFIG_X86_X32_ABI := y
- KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
- KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
- else
- $(warning CONFIG_X86_X32 enabled but no binutils support)
- endif
-endif
-export CONFIG_X86_X32_ABI
-
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
@@ -149,18 +160,6 @@ export CONFIG_X86_X32_ABI
ifdef CONFIG_FUNCTION_GRAPH_TRACER
ifndef CONFIG_HAVE_FENTRY
ACCUMULATE_OUTGOING_ARGS := 1
- else
- ifeq ($(call cc-option-yn, -mfentry), n)
- ACCUMULATE_OUTGOING_ARGS := 1
-
- # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
- # If '-Os' is enabled, disable it and print a warning.
- ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
- undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
- $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
- endif
-
- endif
endif
endif
@@ -169,11 +168,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
endif
-ifdef CONFIG_LTO_CLANG
-KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \
- -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
-endif
-
# Workaround for a gcc prelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
@@ -189,11 +183,21 @@ ifdef CONFIG_RETPOLINE
# only been fixed starting from gcc stable version 8.4.0 and
# onwards, but not for older ones. See gcc bug #86952.
ifndef CONFIG_CC_IS_CLANG
- KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables)
+ KBUILD_CFLAGS += -fno-jump-tables
endif
endif
-KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+ifdef CONFIG_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
+KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
+
+ifdef CONFIG_LTO_CLANG
+ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0)
+KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
+endif
+endif
ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs --discard-none
@@ -230,9 +234,6 @@ head-y += arch/x86/kernel/platform-quirks.o
libs-y += arch/x86/lib/
-# See arch/x86/Kbuild for content of core part of the kernel
-core-y += arch/x86/
-
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
@@ -247,7 +248,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/
boot := arch/x86/boot
-BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage
+BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
PHONY += bzImage $(BOOT_TARGETS)
@@ -268,9 +269,9 @@ endif
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
-PHONY += install bzlilo
-install bzlilo:
- $(Q)$(MAKE) $(build)=$(boot) $@
+PHONY += install
+install:
+ $(call cmd,install)
PHONY += vdso_install
vdso_install:
@@ -293,8 +294,6 @@ endif
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
- $(Q)$(MAKE) $(clean)=$(boot)
- $(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
@@ -305,12 +304,14 @@ define archhelp
echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' hdimage - Create a BIOS/EFI hard disk image (arch/x86/boot/hdimage)'
echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
echo ''
echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+ echo ' x86_debug.config - Enable tip tree debugging options for testing'
endef
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 1db7913795f5..b3c1ae084180 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -44,7 +44,7 @@ ELF_FORMAT := elf64-x86-64
# Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example.
-LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64
+LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib64
LINK-y += -m64
endif
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index cd3056759880..94834c4b5e5e 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -2,12 +2,12 @@
# CPU tuning section - shared with UML.
# Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
-#-mtune exists since gcc 3.4
-HAS_MTUNE := $(call cc-option-yn, -mtune=i386)
-ifeq ($(HAS_MTUNE),y)
tune = $(call cc-option,-mtune=$(1),$(2))
+
+ifdef CONFIG_CC_IS_CLANG
+align := -falign-functions=0 $(call cc-option,-falign-jumps=0) $(call cc-option,-falign-loops=0)
else
-tune = $(call cc-option,-mcpu=$(1),$(2))
+align := -falign-functions=0 -falign-jumps=0 -falign-loops=0
endif
cflags-$(CONFIG_M486SX) += -march=i486
@@ -25,11 +25,11 @@ cflags-$(CONFIG_MK6) += -march=k6
# They make zero difference whatsosever to performance at this time.
cflags-$(CONFIG_MK7) += -march=athlon
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
-cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0
-cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)
+cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
-cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
cflags-$(CONFIG_MVIAC7) += -march=i686
cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 9cc7f1357b9b..1189be057ebd 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -11,3 +11,4 @@ setup.elf
fdimage
mtools.conf
image.iso
+hdimage
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index fe605205b4ce..b5aecb524a8a 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -29,7 +29,7 @@ KCOV_INSTRUMENT := n
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
-targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
+targets += fdimage fdimage144 fdimage288 image.iso hdimage
subdir- := compressed
setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
@@ -115,47 +115,44 @@ $(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed $@
# Set this if you want to pass append arguments to the
-# bzdisk/fdimage/isoimage kernel
+# bzdisk/fdimage/hdimage/isoimage kernel
FDARGS =
-# Set this if you want an initrd included with the
-# bzdisk/fdimage/isoimage kernel
+# Set this if you want one or more initrds included in the image
FDINITRD =
-image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
+imgdeps = $(obj)/bzImage $(obj)/mtools.conf $(src)/genimage.sh
$(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+targets += mtools.conf
+
+# genimage.sh requires bash, but it also has a bunch of other
+# external dependencies.
quiet_cmd_genimage = GENIMAGE $3
-cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
- $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
+cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD)
-PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install
+PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
# This requires write access to /dev/fd0
-bzdisk: $(obj)/bzImage $(obj)/mtools.conf
+# All images require syslinux to be installed; hdimage also requires
+# EDK2/OVMF if the kernel is compiled with the EFI stub.
+bzdisk: $(imgdeps)
$(call cmd,genimage,bzdisk,/dev/fd0)
-# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
+fdimage fdimage144: $(imgdeps)
$(call cmd,genimage,fdimage144,$(obj)/fdimage)
@$(kecho) 'Kernel: $(obj)/fdimage is ready'
-fdimage288: $(obj)/bzImage $(obj)/mtools.conf
+fdimage288: $(imgdeps)
$(call cmd,genimage,fdimage288,$(obj)/fdimage)
@$(kecho) 'Kernel: $(obj)/fdimage is ready'
-isoimage: $(obj)/bzImage
+hdimage: $(imgdeps)
+ $(call cmd,genimage,hdimage,$(obj)/hdimage)
+ @$(kecho) 'Kernel: $(obj)/hdimage is ready'
+
+isoimage: $(imgdeps)
$(call cmd,genimage,isoimage,$(obj)/image.iso)
@$(kecho) 'Kernel: $(obj)/image.iso is ready'
-
-bzlilo:
- if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
- if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
- cp System.map $(INSTALL_PATH)/
- if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
-
-install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
- System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index ca866f1cca2e..148ba5c5106e 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -18,7 +18,7 @@
#ifndef __ASSEMBLY__
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
#include <asm/setup.h>
@@ -26,6 +26,7 @@
#include "bitops.h"
#include "ctype.h"
#include "cpuflags.h"
+#include "io.h"
/* Useful macros */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
@@ -35,44 +36,10 @@ extern struct boot_params boot_params;
#define cpu_relax() asm volatile("rep; nop")
-/* Basic port I/O */
-static inline void outb(u8 v, u16 port)
-{
- asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u8 inb(u16 port)
-{
- u8 v;
- asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outw(u16 v, u16 port)
-{
- asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u16 inw(u16 port)
-{
- u16 v;
- asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outl(u32 v, u16 port)
-{
- asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u32 inl(u16 port)
-{
- u32 v;
- asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
static inline void io_delay(void)
{
const u16 DELAY_PORT = 0x80;
- asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+ outb(0, DELAY_PORT);
}
/* These functions are used to reference data in other segments. */
@@ -110,66 +77,78 @@ typedef unsigned int addr_t;
static inline u8 rdfs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdfs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdfs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrfs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrfs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrfs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline u8 rdgs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdgs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdgs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrgs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrgs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrgs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
/* Note: these only return true/false, not a signed return value! */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e0bc3988c3fa..19e1905dcbf6 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,8 +28,13 @@ KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
-KBUILD_CFLAGS := -m$(BITS) -O2
+# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in
+# case of cross compiling, as it has the '--target=' flag, which is needed to
+# avoid errors with '-march=i386', and future flags may depend on the target to
+# be valid.
+KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
+KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
@@ -47,10 +52,10 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-# sev-es.c indirectly inludes inat-table.h which is generated during
+# sev.c indirectly inludes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
-CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/
+CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
@@ -92,12 +97,14 @@ ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
- vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
+vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE
@@ -121,17 +128,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
+ $(call if_changed,bzip2_with_size)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
+ $(call if_changed,lzma_with_size)
$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,xzkern)
+ $(call if_changed,xzkern_with_size)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzo)
+ $(call if_changed,lzo_with_size)
$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lz4)
+ $(call if_changed,lz4_with_size)
$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,zstd22)
+ $(call if_changed,zstd22_with_size)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 8bcbcee54aa1..9caf89063e77 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -3,10 +3,9 @@
#include "misc.h"
#include "error.h"
#include "../string.h"
+#include "efi.h"
#include <linux/numa.h>
-#include <linux/efi.h>
-#include <asm/efi.h>
/*
* Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
@@ -20,153 +19,56 @@
*/
struct mem_vector immovable_mem[MAX_NUMNODES*2];
-/*
- * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and
- * ACPI_TABLE_GUID are found, take the former, which has more features.
- */
static acpi_physical_address
-__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables,
- bool efi_64)
+__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
{
- acpi_physical_address rsdp_addr = 0;
-
#ifdef CONFIG_EFI
- int i;
-
- /* Get EFI tables from systab. */
- for (i = 0; i < nr_tables; i++) {
- acpi_physical_address table;
- efi_guid_t guid;
-
- if (efi_64) {
- efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i;
-
- guid = tbl->guid;
- table = tbl->table;
-
- if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
- debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n");
- return 0;
- }
- } else {
- efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i;
-
- guid = tbl->guid;
- table = tbl->table;
- }
+ unsigned long rsdp_addr;
+ int ret;
- if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
- rsdp_addr = table;
- else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
- return table;
- }
+ /*
+ * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to
+ * ACPI_TABLE_GUID because it has more features.
+ */
+ rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_20_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */
+ rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ debug_putstr("Error getting RSDP address.\n");
#endif
- return rsdp_addr;
-}
-
-/* EFI/kexec support is 64-bit only. */
-#ifdef CONFIG_X86_64
-static struct efi_setup_data *get_kexec_setup_data_addr(void)
-{
- struct setup_data *data;
- u64 pa_data;
-
- pa_data = boot_params->hdr.setup_data;
- while (pa_data) {
- data = (struct setup_data *)pa_data;
- if (data->type == SETUP_EFI)
- return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
-
- pa_data = data->next;
- }
- return NULL;
-}
-
-static acpi_physical_address kexec_get_rsdp_addr(void)
-{
- efi_system_table_64_t *systab;
- struct efi_setup_data *esd;
- struct efi_info *ei;
- char *sig;
-
- esd = (struct efi_setup_data *)get_kexec_setup_data_addr();
- if (!esd)
- return 0;
-
- if (!esd->tables) {
- debug_putstr("Wrong kexec SETUP_EFI data.\n");
- return 0;
- }
-
- ei = &boot_params->efi_info;
- sig = (char *)&ei->efi_loader_signature;
- if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
- debug_putstr("Wrong kexec EFI loader signature.\n");
- return 0;
- }
-
- /* Get systab from boot params. */
- systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32));
- if (!systab)
- error("EFI system table not found in kexec boot_params.");
-
- return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true);
+ return 0;
}
-#else
-static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; }
-#endif /* CONFIG_X86_64 */
static acpi_physical_address efi_get_rsdp_addr(void)
{
#ifdef CONFIG_EFI
- unsigned long systab, config_tables;
+ unsigned long cfg_tbl_pa = 0;
+ unsigned int cfg_tbl_len;
+ unsigned long systab_pa;
unsigned int nr_tables;
- struct efi_info *ei;
- bool efi_64;
- char *sig;
-
- ei = &boot_params->efi_info;
- sig = (char *)&ei->efi_loader_signature;
-
- if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
- efi_64 = true;
- } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
- efi_64 = false;
- } else {
- debug_putstr("Wrong EFI loader signature.\n");
- return 0;
- }
+ enum efi_type et;
+ int ret;
- /* Get systab from boot params. */
-#ifdef CONFIG_X86_64
- systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
-#else
- if (ei->efi_systab_hi || ei->efi_memmap_hi) {
- debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n");
+ et = efi_get_type(boot_params);
+ if (et == EFI_TYPE_NONE)
return 0;
- }
- systab = ei->efi_systab;
-#endif
- if (!systab)
- error("EFI system table not found.");
- /* Handle EFI bitness properly */
- if (efi_64) {
- efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab;
+ systab_pa = efi_get_system_table(boot_params);
+ if (!systab_pa)
+ error("EFI support advertised, but unable to locate system table.");
- config_tables = stbl->tables;
- nr_tables = stbl->nr_tables;
- } else {
- efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab;
+ ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len);
+ if (ret || !cfg_tbl_pa)
+ error("EFI config table not found.");
- config_tables = stbl->tables;
- nr_tables = stbl->nr_tables;
- }
-
- if (!config_tables)
- error("EFI config tables not found.");
-
- return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64);
+ return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len);
#else
return 0;
#endif
@@ -256,14 +158,6 @@ acpi_physical_address get_rsdp_addr(void)
pa = boot_params->acpi_rsdp_addr;
- /*
- * Try to get EFI data from setup_data. This can happen when we're a
- * kexec'ed kernel and kexec(1) has passed all the required EFI info to
- * us.
- */
- if (!pa)
- pa = kexec_get_rsdp_addr();
-
if (!pa)
pa = efi_get_rsdp_addr();
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb9582..70a8d1706d0f 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,6 @@
#include "misc.h"
-int early_serial_base;
+/* This might be accessed before .bss is cleared, so use .data instead. */
+int early_serial_base __section(".data");
#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c
new file mode 100644
index 000000000000..6edd034b0b30
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for early access to EFI configuration table.
+ *
+ * Originally derived from arch/x86/boot/compressed/acpi.c
+ */
+
+#include "misc.h"
+
+/**
+ * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise.
+ */
+enum efi_type efi_get_type(struct boot_params *bp)
+{
+ struct efi_info *ei;
+ enum efi_type et;
+ const char *sig;
+
+ ei = &bp->efi_info;
+ sig = (char *)&ei->efi_loader_signature;
+
+ if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_64;
+ } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_32;
+ } else {
+ debug_putstr("No EFI environment detected.\n");
+ et = EFI_TYPE_NONE;
+ }
+
+#ifndef CONFIG_X86_64
+ /*
+ * Existing callers like acpi.c treat this case as an indicator to
+ * fall-through to non-EFI, rather than an error, so maintain that
+ * functionality here as well.
+ */
+ if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+ debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n");
+ et = EFI_TYPE_NONE;
+ }
+#endif
+
+ return et;
+}
+
+/**
+ * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address
+ * of the EFI system table.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI system table address on success. On error, return 0.
+ */
+unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ unsigned long sys_tbl_pa;
+ struct efi_info *ei;
+ enum efi_type et;
+
+ /* Get systab from boot params. */
+ ei = &bp->efi_info;
+#ifdef CONFIG_X86_64
+ sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+ sys_tbl_pa = ei->efi_systab;
+#endif
+ if (!sys_tbl_pa) {
+ debug_putstr("EFI system table not found.");
+ return 0;
+ }
+
+ return sys_tbl_pa;
+}
+
+/*
+ * EFI config table address changes to virtual address after boot, which may
+ * not be accessible for the kexec'd kernel. To address this, kexec provides
+ * the initial physical address via a struct setup_data entry, which is
+ * checked for here, along with some sanity checks.
+ */
+static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp,
+ enum efi_type et)
+{
+#ifdef CONFIG_X86_64
+ struct efi_setup_data *esd = NULL;
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = bp->hdr.setup_data;
+ while (pa_data) {
+ data = (struct setup_data *)pa_data;
+ if (data->type == SETUP_EFI) {
+ esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
+ break;
+ }
+
+ pa_data = data->next;
+ }
+
+ /*
+ * Original ACPI code falls back to attempting normal EFI boot in these
+ * cases, so maintain existing behavior by indicating non-kexec
+ * environment to the caller, but print them for debugging.
+ */
+ if (esd && !esd->tables) {
+ debug_putstr("kexec EFI environment missing valid configuration table.\n");
+ return NULL;
+ }
+
+ return esd;
+#endif
+ return NULL;
+}
+
+/**
+ * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical
+ * address of EFI configuration table.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: location to store physical address of config table
+ * @cfg_tbl_len: location to store number of config table entries
+ *
+ * Return: 0 on success. On error, return params are left unchanged.
+ */
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ unsigned long sys_tbl_pa;
+ enum efi_type et;
+ int ret;
+
+ if (!cfg_tbl_pa || !cfg_tbl_len)
+ return -EINVAL;
+
+ sys_tbl_pa = efi_get_system_table(bp);
+ if (!sys_tbl_pa)
+ return -EINVAL;
+
+ /* Handle EFI bitness properly */
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_64) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa;
+ struct efi_setup_data *esd;
+
+ /* kexec provides an alternative EFI conf table, check for it. */
+ esd = get_kexec_setup_data(bp, et);
+
+ *cfg_tbl_pa = esd ? esd->tables : stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else if (et == EFI_TYPE_32) {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa;
+
+ *cfg_tbl_pa = stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Get vendor table address/guid from EFI config table at the given index */
+static int get_vendor_table(void *cfg_tbl, unsigned int idx,
+ unsigned long *vendor_tbl_pa,
+ efi_guid_t *vendor_tbl_guid,
+ enum efi_type et)
+{
+ if (et == EFI_TYPE_64) {
+ efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) {
+ debug_putstr("Error: EFI config table entry located above 4GB.\n");
+ return -EINVAL;
+ }
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+
+ } else if (et == EFI_TYPE_32) {
+ efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx;
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * efi_find_vendor_table - Given EFI config table, search it for the physical
+ * address of the vendor table associated with GUID.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: pointer to EFI configuration table
+ * @cfg_tbl_len: number of entries in EFI configuration table
+ * @guid: GUID of vendor table
+ *
+ * Return: vendor table address on success. On error, return 0.
+ */
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ enum efi_type et;
+ unsigned int i;
+
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ for (i = 0; i < cfg_tbl_len; i++) {
+ unsigned long vendor_tbl_pa;
+ efi_guid_t vendor_tbl_guid;
+ int ret;
+
+ ret = get_vendor_table((void *)cfg_tbl_pa, i,
+ &vendor_tbl_pa,
+ &vendor_tbl_guid, et);
+ if (ret)
+ return 0;
+
+ if (!efi_guidcmp(guid, vendor_tbl_guid))
+ return vendor_tbl_pa;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h
new file mode 100644
index 000000000000..7db2f41b54cd
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_EFI_H
+#define BOOT_COMPRESSED_EFI_H
+
+#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H)
+#error Please do not include kernel proper namespace headers
+#endif
+
+typedef guid_t efi_guid_t __aligned(__alignof__(u32));
+
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \
+ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, d } }
+
+#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
+#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+/*
+ * Generic EFI table header
+ */
+typedef struct {
+ u64 signature;
+ u32 revision;
+ u32 headersize;
+ u32 crc32;
+ u32 reserved;
+} efi_table_hdr_t;
+
+#define EFI_CONVENTIONAL_MEMORY 7
+
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
+#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */
+
+#define EFI_PAGE_SHIFT 12
+
+typedef struct {
+ u32 type;
+ u32 pad;
+ u64 phys_addr;
+ u64 virt_addr;
+ u64 num_pages;
+ u64 attribute;
+} efi_memory_desc_t;
+
+#define efi_early_memdesc_ptr(map, desc_size, n) \
+ (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size)))
+
+typedef struct {
+ efi_guid_t guid;
+ u64 table;
+} efi_config_table_64_t;
+
+typedef struct {
+ efi_guid_t guid;
+ u32 table;
+} efi_config_table_32_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u64 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 __pad1;
+ u64 con_in_handle;
+ u64 con_in;
+ u64 con_out_handle;
+ u64 con_out;
+ u64 stderr_handle;
+ u64 stderr;
+ u64 runtime;
+ u64 boottime;
+ u32 nr_tables;
+ u32 __pad2;
+ u64 tables;
+} efi_system_table_64_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u32 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 con_in_handle;
+ u32 con_in;
+ u32 con_out_handle;
+ u32 con_out;
+ u32 stderr_handle;
+ u32 stderr;
+ u32 runtime;
+ u32 boottime;
+ u32 nr_tables;
+ u32 tables;
+} efi_system_table_32_t;
+
+/* kexec external ABI */
+struct efi_setup_data {
+ u64 fw_vendor;
+ u64 __unused;
+ u64 tables;
+ u64 smbios;
+ u64 reserved[8];
+};
+
+static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
+{
+ return memcmp(&left, &right, sizeof (efi_guid_t));
+}
+
+#ifdef CONFIG_EFI
+bool __pure __efi_soft_reserve_enabled(void);
+
+static inline bool __pure efi_soft_reserve_enabled(void)
+{
+ return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE)
+ && __efi_soft_reserve_enabled();
+}
+#else
+static inline bool efi_soft_reserve_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_EFI */
+#endif /* BOOT_COMPRESSED_EFI_H */
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index c4bb0f9363f5..67e7edcdfea8 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -5,9 +5,8 @@
* Early support for invoking 32-bit EFI services from a 64-bit kernel.
*
* Because this thunking occurs before ExitBootServices() we have to
- * restore the firmware's 32-bit GDT before we make EFI serivce calls,
- * since the firmware's 32-bit IDT is still currently installed and it
- * needs to be able to service interrupts.
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
*
* On the plus side, we don't have to worry about mangling 64-bit
* addresses into 32-bits because we're executing with an identity
@@ -27,8 +26,6 @@ SYM_FUNC_START(__efi64_thunk)
push %rbp
push %rbx
- leaq 1f(%rip), %rbp
-
movl %ds, %eax
push %rax
movl %es, %eax
@@ -36,27 +33,42 @@ SYM_FUNC_START(__efi64_thunk)
movl %ss, %eax
push %rax
+ /* Copy args passed on stack */
+ movq 0x30(%rsp), %rbp
+ movq 0x38(%rsp), %rbx
+ movq 0x40(%rsp), %rax
+
/*
* Convert x86-64 ABI params to i386 ABI
*/
- subq $32, %rsp
+ subq $64, %rsp
movl %esi, 0x0(%rsp)
movl %edx, 0x4(%rsp)
movl %ecx, 0x8(%rsp)
movl %r8d, 0xc(%rsp)
movl %r9d, 0x10(%rsp)
+ movl %ebp, 0x14(%rsp)
+ movl %ebx, 0x18(%rsp)
+ movl %eax, 0x1c(%rsp)
- leaq 0x14(%rsp), %rbx
+ leaq 0x20(%rsp), %rbx
sgdt (%rbx)
+ addq $16, %rbx
+ sidt (%rbx)
+
+ leaq 1f(%rip), %rbp
+
/*
- * Switch to gdt with 32-bit segments. This is the firmware GDT
- * that was installed when the kernel started executing. This
- * pointer was saved at the EFI stub entry point in head_64.S.
+ * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT
+ * and IDT that was installed when the kernel started executing. The
+ * pointers were saved at the EFI stub entry point in head_64.S.
*
* Pass the saved DS selector to the 32-bit code, and use far return to
* restore the saved CS selector.
*/
+ leaq efi32_boot_idt(%rip), %rax
+ lidt (%rax)
leaq efi32_boot_gdt(%rip), %rax
lgdt (%rax)
@@ -67,7 +79,7 @@ SYM_FUNC_START(__efi64_thunk)
pushq %rax
lretq
-1: addq $32, %rsp
+1: addq $64, %rsp
movq %rdi, %rax
pop %rbx
@@ -89,7 +101,7 @@ SYM_FUNC_START(__efi64_thunk)
pop %rbx
pop %rbp
- ret
+ RET
SYM_FUNC_END(__efi64_thunk)
.code32
@@ -128,10 +140,13 @@ SYM_FUNC_START_LOCAL(efi_enter32)
/*
* Some firmware will return with interrupts enabled. Be sure to
- * disable them before we switch GDTs.
+ * disable them before we switch GDTs and IDTs.
*/
cli
+ lidtl (%ebx)
+ subl $16, %ebx
+
lgdtl (%ebx)
movl %cr4, %eax
@@ -166,6 +181,11 @@ SYM_DATA_START(efi32_boot_gdt)
.quad 0
SYM_DATA_END(efi32_boot_gdt)
+SYM_DATA_START(efi32_boot_idt)
+ .word 0
+ .quad 0
+SYM_DATA_END(efi32_boot_idt)
+
SYM_DATA_START(efi32_boot_cs)
.word 0
SYM_DATA_END(efi32_boot_cs)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 659fad53ca82..3b354eb9516d 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -152,14 +152,13 @@ SYM_FUNC_END(startup_32)
#ifdef CONFIG_EFI_STUB
SYM_FUNC_START(efi32_stub_entry)
-SYM_FUNC_START_ALIAS(efi_stub_entry)
add $0x4, %esp
movl 8(%esp), %esi /* save boot_params pointer */
call efi_main
/* efi_main returns the possibly relocated address of startup_32 */
jmp *%eax
SYM_FUNC_END(efi32_stub_entry)
-SYM_FUNC_END_ALIAS(efi_stub_entry)
+SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry)
#endif
.text
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e94874f4bbc1..d33f060900d2 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -34,6 +34,7 @@
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
+#include <asm/trapnr.h>
#include "pgtable.h"
/*
@@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32)
movl %eax, %gs
movl %eax, %ss
-/* setup a stack and make sure cpu supports long mode. */
+ /* Setup a stack and load CS from current GDT */
leal rva(boot_stack_end)(%ebp), %esp
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
+
+ /* Setup Exception handling for SEV-ES */
+ call startup32_load_idt
+
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
jnz .Lno_longmode
@@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32)
*/
call get_sev_encryption_bit
xorl %edx, %edx
+#ifdef CONFIG_AMD_MEM_ENCRYPT
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
bts %eax, %edx /* Set encryption mask for page tables */
+ /*
+ * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that
+ * startup32_check_sev_cbit() will do a check. sev_enable() will
+ * initialize sev_status with all the bits reported by
+ * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT
+ * needs to be set for now.
+ */
+ movl $1, rva(sev_status)(%ebp)
1:
+#endif
/* Initialize Page tables to 0 */
leal rva(pgtable)(%ebx), %edi
@@ -231,7 +252,7 @@ SYM_FUNC_START(startup_32)
/*
* Setup for the jump to 64bit mode
*
- * When the jump is performend we will be in long mode but
+ * When the jump is performed we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
* (and in turn EFER.LMA = 1). To jump into 64bit mode we use
* the new gdt/idt that has __KERNEL_CS with CS.L = 1.
@@ -261,11 +282,14 @@ SYM_FUNC_START(startup_32)
movl %esi, %edx
1:
#endif
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+
pushl $__KERNEL_CS
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/* Jump from 32bit compatibility mode into 64bit mode. */
@@ -295,6 +319,9 @@ SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL)
movw %cs, rva(efi32_boot_cs)(%ebp)
movw %ds, rva(efi32_boot_ds)(%ebp)
+ /* Store firmware IDT descriptor */
+ sidtl rva(efi32_boot_idt)(%ebp)
+
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
@@ -420,6 +447,23 @@ SYM_CODE_START(startup_64)
call load_stage1_idt
popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Now that the stage1 interrupt handlers are set up, #VC exceptions from
+ * CPUID instructions can be properly handled for SEV-ES guests.
+ *
+ * For SEV-SNP, the CPUID table also needs to be set up in advance of any
+ * CPUID instructions being issued, so go ahead and do that now via
+ * sev_enable(), which will also handle the rest of the SEV-related
+ * detection/setup to ensure that has been done in advance of any dependent
+ * code.
+ */
+ pushq %rsi
+ movq %rsi, %rdi /* real mode address */
+ call sev_enable
+ popq %rsi
+#endif
+
/*
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
@@ -508,7 +552,6 @@ SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
.org 0x390
SYM_FUNC_START(efi64_stub_entry)
-SYM_FUNC_START_ALIAS(efi_stub_entry)
and $~0xf, %rsp /* realign the stack */
movq %rdx, %rbx /* save boot_params pointer */
call efi_main
@@ -516,7 +559,7 @@ SYM_FUNC_START_ALIAS(efi_stub_entry)
leaq rva(startup_64)(%rax), %rax
jmp *%rax
SYM_FUNC_END(efi64_stub_entry)
-SYM_FUNC_END_ALIAS(efi_stub_entry)
+SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry)
#endif
.text
@@ -532,17 +575,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx
rep stosq
-/*
- * If running as an SEV guest, the encryption mask is required in the
- * page-table setup code below. When the guest also has SEV-ES enabled
- * set_sev_encryption_mask() will cause #VC exceptions, but the stage2
- * handler can't map its GHCB because the page-table is not set up yet.
- * So set up the encryption mask here while still on the stage1 #VC
- * handler. Then load stage2 IDT and switch to the kernel's own
- * page-table.
- */
pushq %rsi
- call set_sev_encryption_mask
call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */
@@ -616,12 +649,28 @@ SYM_CODE_START(trampoline_32bit_src)
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
wrmsr
- popl %edx
+1: popl %edx
popl %ecx
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ movl %cr4, %eax
+ andl $X86_CR4_MCE, %eax
+#else
+ movl $0, %eax
+#endif
+
/* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
+ orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
@@ -635,8 +684,9 @@ SYM_CODE_START(trampoline_32bit_src)
pushl $__KERNEL_CS
pushl %eax
- /* Enable paging again */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
lret
@@ -694,6 +744,19 @@ SYM_DATA_START(boot_idt)
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA_START(boot32_idt_desc)
+ .word boot32_idt_end - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
+ .balign 8
+SYM_DATA_START(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
+#endif
+
#ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0)
#endif
@@ -773,7 +836,7 @@ SYM_FUNC_START(efi32_pe_entry)
2: popl %edi // restore callee-save registers
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi32_pe_entry)
.section ".rodata"
@@ -786,6 +849,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto)
SYM_DATA_END(loaded_image_proto)
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ __HEAD
+ .code32
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ *
+ * Physical offset is expected in %ebp
+ */
+SYM_FUNC_START(startup32_set_idt_entry)
+ push %ebx
+ push %ecx
+
+ /* IDT entry address to %ebx */
+ leal rva(boot32_idt)(%ebp), %ebx
+ shl $3, %edx
+ addl %edx, %ebx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ movl $__KERNEL32_CS, %ecx # Target code segment selector
+ shl $16, %ecx
+ orl %ecx, %edx
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ebx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ebx)
+
+ pop %ecx
+ pop %ebx
+ RET
+SYM_FUNC_END(startup32_set_idt_entry)
+#endif
+
+SYM_FUNC_START(startup32_load_idt)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* #VC handler */
+ leal rva(startup32_vc_handler)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal rva(boot32_idt)(%ebp), %eax
+ movl %eax, rva(boot32_idt_desc+2)(%ebp)
+ lidt rva(boot32_idt_desc)(%ebp)
+#endif
+ RET
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Check for non-zero sev_status */
+ movl rva(sev_status)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ movl %eax, rva(sev_check_data)(%ebp)
+ movl %ebx, rva(sev_check_data+4)(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, rva(sev_check_data)(%ebp)
+ jne 3f
+ cmpl %ebx, rva(sev_check_data+4)(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+#endif
+ RET
+SYM_FUNC_END(startup32_check_sev_cbit)
+
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index f7213d0943b8..44c350d627c7 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -90,7 +90,7 @@ static struct x86_mapping_info mapping_info;
/*
* Adds the specified range to the identity mappings.
*/
-static void add_identity_map(unsigned long start, unsigned long end)
+void kernel_add_identity_map(unsigned long start, unsigned long end)
{
int ret;
@@ -157,14 +157,15 @@ void initialize_identity_maps(void *rmode)
* explicitly here in case the compressed kernel does not touch them,
* or does not touch all the pages covering them.
*/
- add_identity_map((unsigned long)_head, (unsigned long)_end);
+ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
boot_params = rmode;
- add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
+ kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
cmdline = get_cmd_line_ptr();
- add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+ kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+
+ sev_prep_identity_maps(top_level_pgt);
/* Load the new page-table. */
- sev_verify_cbit(top_level_pgt);
write_cr3(top_level_pgt);
}
@@ -246,10 +247,10 @@ static int set_clr_page_flags(struct x86_mapping_info *info,
* It should already exist, but keep things generic.
*
* To map the page just read from it and fault it in if there is no
- * mapping yet. add_identity_map() can't be called here because that
- * would unconditionally map the address on PMD level, destroying any
- * PTE-level mappings that might already exist. Use assembly here so
- * the access won't be optimized away.
+ * mapping yet. kernel_add_identity_map() can't be called here because
+ * that would unconditionally map the address on PMD level, destroying
+ * any PTE-level mappings that might already exist. Use assembly here
+ * so the access won't be optimized away.
*/
asm volatile("mov %[address], %%r9"
:: [address] "g" (*(unsigned long *)address)
@@ -275,15 +276,31 @@ static int set_clr_page_flags(struct x86_mapping_info *info,
* Changing encryption attributes of a page requires to flush it from
* the caches.
*/
- if ((set | clr) & _PAGE_ENC)
+ if ((set | clr) & _PAGE_ENC) {
clflush_page(address);
+ /*
+ * If the encryption attribute is being cleared, change the page state
+ * to shared in the RMP table.
+ */
+ if (clr)
+ snp_set_page_shared(__pa(address & PAGE_MASK));
+ }
+
/* Update PTE */
pte = *ptep;
pte = pte_set_flags(pte, set);
pte = pte_clear_flags(pte, clr);
set_pte(ptep, pte);
+ /*
+ * If the encryption attribute is being set, then change the page state to
+ * private in the RMP entry. The page state change must be done after the PTE
+ * is updated.
+ */
+ if (set & _PAGE_ENC)
+ snp_set_page_private(__pa(address & PAGE_MASK));
+
/* Flush TLB after changing encryption attribute */
write_cr3(top_level_pgt);
@@ -347,5 +364,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
* Error code is sane - now identity map the 2M region around
* the faulting address.
*/
- add_identity_map(address, end);
+ kernel_add_identity_map(address, end);
}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 804a502ee0d2..6debb816e83d 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -39,7 +39,23 @@ void load_stage1_idt(void)
load_boot_idt(&boot_idt_desc);
}
-/* Setup IDT after kernel jumping to .Lrelocated */
+/*
+ * Setup IDT after kernel jumping to .Lrelocated.
+ *
+ * initialize_identity_maps() needs a #PF handler to be setup
+ * in order to be able to fault-in identity mapping ranges; see
+ * do_boot_page_fault().
+ *
+ * This #PF handler setup needs to happen in load_stage2_idt() where the
+ * IDT is loaded and there the #VC IDT entry gets setup too.
+ *
+ * In order to be able to handle #VCs, one needs a GHCB which
+ * gets setup with an already set up pagetable, which is done in
+ * initialize_identity_maps(). And there's the catch 22: the boot #VC
+ * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself
+ * (and, especially set_page_decrypted()) because the SEV-ES setup code
+ * cannot initialize a GHCB as there's no #PF handler yet...
+ */
void load_stage2_idt(void)
{
boot_idt_desc.address = (unsigned long)boot_idt;
@@ -52,3 +68,17 @@ void load_stage2_idt(void)
load_boot_idt(&boot_idt_desc);
}
+
+void cleanup_exception_handling(void)
+{
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
+ /* Set a null-idt, disabling #PF and #VC handling */
+ boot_idt_desc.size = 0;
+ boot_idt_desc.address = 0;
+ load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index b92fffbe761f..4a3f223973f4 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -22,19 +22,14 @@
#include "misc.h"
#include "error.h"
#include "../string.h"
+#include "efi.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
-#include <linux/efi.h>
#include <generated/utsrelease.h>
-#include <asm/efi.h>
-
-/* Macros used by the included decompressor code below. */
-#define STATIC
-#include <linux/decompress/mm.h>
#define _SETUP
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
@@ -639,9 +634,9 @@ static bool process_mem_region(struct mem_vector *region,
if (slot_area_index == MAX_SLOT_AREA) {
debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
- return 1;
+ return true;
}
- return 0;
+ return false;
}
#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
@@ -668,7 +663,7 @@ static bool process_mem_region(struct mem_vector *region,
if (slot_area_index == MAX_SLOT_AREA) {
debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
- return 1;
+ return true;
}
}
#endif
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index aa561795efd1..a73e4d783cae 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit)
push %ecx
push %edx
- /* Check if running under a hypervisor */
- movl $1, %eax
- cpuid
- bt $31, %ecx /* Check the hypervisor bit */
- jnc .Lno_sev
-
movl $0x80000000, %eax /* CPUID to check the highest leaf */
cpuid
cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
@@ -64,49 +58,135 @@ SYM_FUNC_START(get_sev_encryption_bit)
#endif /* CONFIG_AMD_MEM_ENCRYPT */
- ret
+ RET
SYM_FUNC_END(get_sev_encryption_bit)
- .code64
-
-#include "../../kernel/sev_verify_cbit.S"
-
-SYM_FUNC_START(set_sev_encryption_mask)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- push %rbp
- push %rdx
-
- movq %rsp, %rbp /* Save current stack pointer */
-
- call get_sev_encryption_bit /* Get the encryption bit position */
- testl %eax, %eax
- jz .Lno_sev_mask
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ * the GHCB MSR protocol
+ *
+ * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx: CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+ shll $30, %eax
+ orl $0x00000004, %eax
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall # VMGEXIT
+ rdmsr
- bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
+ /* Check response */
+ movl %eax, %ecx
+ andl $0x3ffff000, %ecx # Bits [12-29] MBZ
+ jnz 2f
+
+ /* Check return code */
+ andl $0xfff, %eax
+ cmpl $5, %eax
+ jne 2f
+
+ /* All good - return success */
+ xorl %eax, %eax
+1:
+ RET
+2:
+ movl $-1, %eax
+ jmp 1b
+SYM_CODE_END(sev_es_req_cpuid)
+
+SYM_CODE_START(startup32_vc_handler)
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Keep CPUID function in %ebx */
+ movl %eax, %ebx
+
+ /* Check if error-code == SVM_EXIT_CPUID */
+ cmpl $0x72, 16(%esp)
+ jne .Lfail
+
+ movl $0, %eax # Request CPUID[fn].EAX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 12(%esp) # Store result
+
+ movl $1, %eax # Request CPUID[fn].EBX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 8(%esp) # Store result
+
+ movl $2, %eax # Request CPUID[fn].ECX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 4(%esp) # Store result
+
+ movl $3, %eax # Request CPUID[fn].EDX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 0(%esp) # Store result
/*
- * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in
- * get_sev_encryption_bit() because this function is 32-bit code and
- * shared between 64-bit and 32-bit boot path.
+ * Sanity check CPUID results from the Hypervisor. See comment in
+ * do_vc_no_ghcb() for more details on why this is necessary.
*/
- movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
- rdmsr
-
- /* Store MSR value in sev_status */
- shlq $32, %rdx
- orq %rdx, %rax
- movq %rax, sev_status(%rip)
-.Lno_sev_mask:
- movq %rbp, %rsp /* Restore original stack pointer */
+ /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+ cmpl $0x80000000, %ebx
+ jne .Lcheck_sev
+ cmpl $0x8000001f, 12(%esp)
+ jb .Lfail
+ jmp .Ldone
+
+.Lcheck_sev:
+ /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+ cmpl $0x8000001f, %ebx
+ jne .Ldone
+ btl $1, 12(%esp)
+ jnc .Lfail
+
+.Ldone:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ /* Remove error code */
+ addl $4, %esp
+
+ /* Jump over CPUID instruction */
+ addl $2, (%esp)
+
+ iret
+.Lfail:
+ /* Send terminate request to Hypervisor */
+ movl $0x100, %eax
+ xorl %edx, %edx
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall
+
+ /* If request fails, go to hlt loop */
+ hlt
+ jmp .Lfail
+SYM_CODE_END(startup32_vc_handler)
- pop %rdx
- pop %rbp
-#endif
+ .code64
- xor %rax, %rax
- ret
-SYM_FUNC_END(set_sev_encryption_mask)
+#include "../../kernel/sev_verify_cbit.S"
.data
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 267e7f93050e..cf690d8712f4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -28,28 +28,37 @@
/* Macros used by the included decompressor code below. */
#define STATIC static
+/* Define an externally visible malloc()/free(). */
+#define MALLOC_VISIBLE
+#include <linux/decompress/mm.h>
/*
* Provide definitions of memzero and memmove as some of the decompressors will
* try to define their own functions if these are not defined as macros.
*/
#define memzero(s, n) memset((s), 0, (n))
+#ifndef memmove
#define memmove memmove
-
/* Functions used by the included decompressor code below. */
void *memmove(void *dest, const void *src, size_t n);
+#endif
/*
* This is set up by the setup-routine at boot-time
*/
struct boot_params *boot_params;
+struct port_io_ops pio_ops;
+
memptr free_mem_ptr;
memptr free_mem_end_ptr;
static char *vidmem;
static int vidport;
-static int lines, cols;
+
+/* These might be accessed before .bss is cleared, so use .data instead. */
+static int lines __section(".data");
+static int cols __section(".data");
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
@@ -172,7 +181,7 @@ void __puthex(unsigned long value)
}
}
-#if CONFIG_X86_NEED_RELOCS
+#ifdef CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
{
@@ -367,6 +376,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
lines = boot_params->screen_info.orig_video_lines;
cols = boot_params->screen_info.orig_video_cols;
+ init_default_io_ops();
+
+ /*
+ * Detect TDX guest environment.
+ *
+ * It has to be done before console_init() in order to use
+ * paravirtualized port I/O operations if needed.
+ */
+ early_tdx_detect();
+
console_init();
/*
@@ -430,8 +449,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
- if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Destination address does not match LOAD_PHYSICAL_ADDR");
if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
@@ -443,11 +460,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
- /*
- * Flush GHCB from cache and map it encrypted again when running as
- * SEV-ES guest.
- */
- sev_es_shutdown_ghcb();
+ /* Disable exception handling before booting the kernel */
+ cleanup_exception_handling();
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 901ea5ebec22..4910bf230d7b 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -14,23 +14,29 @@
#undef CONFIG_KASAN
#undef CONFIG_KASAN_GENERIC
+#define __NO_FORTIFY
+
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
-#include <linux/io.h>
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
+#include "tdx.h"
+
#define BOOT_CTYPE_H
#include <linux/acpi.h>
#define BOOT_BOOT_H
#include "../ctype.h"
+#include "../io.h"
+
+#include "efi.h"
#ifdef CONFIG_X86_64
#define memptr long
@@ -44,6 +50,8 @@ extern char _head[], _end[];
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
+void *malloc(int size);
+void free(void *where);
extern struct boot_params *boot_params;
void __putstr(const char *s);
void __puthex(unsigned long value);
@@ -79,7 +87,7 @@ struct mem_vector {
u64 size;
};
-#if CONFIG_RANDOMIZE_BASE
+#ifdef CONFIG_RANDOMIZE_BASE
/* kaslr.c */
void choose_random_location(unsigned long input,
unsigned long input_size,
@@ -116,17 +124,23 @@ static inline void console_init(void)
{ }
#endif
-void set_sev_encryption_mask(void);
-
#ifdef CONFIG_AMD_MEM_ENCRYPT
+void sev_enable(struct boot_params *bp);
void sev_es_shutdown_ghcb(void);
extern bool sev_es_check_ghcb_fault(unsigned long address);
+void snp_set_page_private(unsigned long paddr);
+void snp_set_page_shared(unsigned long paddr);
+void sev_prep_identity_maps(unsigned long top_level_pgt);
#else
+static inline void sev_enable(struct boot_params *bp) { }
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
return false;
}
+static inline void snp_set_page_private(unsigned long paddr) { }
+static inline void snp_set_page_shared(unsigned long paddr) { }
+static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { }
#endif
/* acpi.c */
@@ -147,6 +161,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
#endif
+extern void kernel_add_identity_map(unsigned long start, unsigned long end);
/* Used by PAGE_KERN* macros: */
extern pteval_t __default_kernel_pte_mask;
@@ -155,6 +170,12 @@ extern pteval_t __default_kernel_pte_mask;
extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
extern struct desc_ptr boot_idt_desc;
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
+#endif
+
/* IDT Entry Points */
void boot_page_fault(void);
void boot_stage1_vc(void);
@@ -162,4 +183,47 @@ void boot_stage2_vc(void);
unsigned long sev_verify_cbit(unsigned long cr3);
+enum efi_type {
+ EFI_TYPE_64,
+ EFI_TYPE_32,
+ EFI_TYPE_NONE,
+};
+
+#ifdef CONFIG_EFI
+/* helpers for early EFI config table access */
+enum efi_type efi_get_type(struct boot_params *bp);
+unsigned long efi_get_system_table(struct boot_params *bp);
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len);
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid);
+#else
+static inline enum efi_type efi_get_type(struct boot_params *bp)
+{
+ return EFI_TYPE_NONE;
+}
+
+static inline unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ return 0;
+}
+
+static inline int efi_get_conf_table(struct boot_params *bp,
+ unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ return -ENOENT;
+}
+
+static inline unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 6ff7e81b5628..cc9b2529a086 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 2a78746f5a4c..2ac12ff4111b 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,9 +1,10 @@
-#include <linux/efi.h>
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
#include <asm/e820/types.h>
#include <asm/processor.h>
-#include <asm/efi.h>
#include "pgtable.h"
#include "../string.h"
+#include "efi.h"
#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
deleted file mode 100644
index 27826c265aab..000000000000
--- a/arch/x86/boot/compressed/sev-es.c
+++ /dev/null
@@ -1,213 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * AMD Encrypted Register State Support
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- */
-
-/*
- * misc.h needs to be first because it knows how to include the other kernel
- * headers in the pre-decompression code in a way that does not break
- * compilation.
- */
-#include "misc.h"
-
-#include <asm/pgtable_types.h>
-#include <asm/sev-es.h>
-#include <asm/trapnr.h>
-#include <asm/trap_pf.h>
-#include <asm/msr-index.h>
-#include <asm/fpu/xcr.h>
-#include <asm/ptrace.h>
-#include <asm/svm.h>
-
-#include "error.h"
-
-struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
-struct ghcb *boot_ghcb;
-
-/*
- * Copy a version of this function here - insn-eval.c can't be used in
- * pre-decompression code.
- */
-static bool insn_has_rep_prefix(struct insn *insn)
-{
- insn_byte_t p;
- int i;
-
- insn_get_prefixes(insn);
-
- for_each_insn_prefix(insn, i, p) {
- if (p == 0xf2 || p == 0xf3)
- return true;
- }
-
- return false;
-}
-
-/*
- * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
- * doesn't use segments.
- */
-static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
-{
- return 0UL;
-}
-
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- unsigned long low, high;
-
- asm volatile("rdmsr" : "=a" (low), "=d" (high) :
- "c" (MSR_AMD64_SEV_ES_GHCB));
-
- return ((high << 32) | low);
-}
-
-static inline void sev_es_wr_ghcb_msr(u64 val)
-{
- u32 low, high;
-
- low = val & 0xffffffffUL;
- high = val >> 32;
-
- asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB),
- "a"(low), "d" (high) : "memory");
-}
-
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- enum es_result ret;
-
- memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
-
- insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
- insn_get_length(&ctxt->insn);
-
- ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
-
- return ret;
-}
-
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- void *dst, char *buf, size_t size)
-{
- memcpy(dst, buf, size);
-
- return ES_OK;
-}
-
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- void *src, char *buf, size_t size)
-{
- memcpy(buf, src, size);
-
- return ES_OK;
-}
-
-#undef __init
-#undef __pa
-#define __init
-#define __pa(x) ((unsigned long)(x))
-
-#define __BOOT_COMPRESSED
-
-/* Basic instruction decoding support needed */
-#include "../../lib/inat.c"
-#include "../../lib/insn.c"
-
-/* Include code for early handlers */
-#include "../../kernel/sev-es-shared.c"
-
-static bool early_setup_sev_es(void)
-{
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
-
- if (set_page_decrypted((unsigned long)&boot_ghcb_page))
- return false;
-
- /* Page is now mapped decrypted, clear it */
- memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
-
- boot_ghcb = &boot_ghcb_page;
-
- /* Initialize lookup tables for the instruction decoder */
- inat_init_tables();
-
- return true;
-}
-
-void sev_es_shutdown_ghcb(void)
-{
- if (!boot_ghcb)
- return;
-
- if (!sev_es_check_cpu_features())
- error("SEV-ES CPU Features missing.");
-
- /*
- * GHCB Page must be flushed from the cache and mapped encrypted again.
- * Otherwise the running kernel will see strange cache effects when
- * trying to use that page.
- */
- if (set_page_encrypted((unsigned long)&boot_ghcb_page))
- error("Can't map GHCB page encrypted");
-
- /*
- * GHCB page is mapped encrypted again and flushed from the cache.
- * Mark it non-present now to catch bugs when #VC exceptions trigger
- * after this point.
- */
- if (set_page_non_present((unsigned long)&boot_ghcb_page))
- error("Can't unmap GHCB page");
-}
-
-bool sev_es_check_ghcb_fault(unsigned long address)
-{
- /* Check whether the fault was on the GHCB page */
- return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
-}
-
-void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
-{
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
-
- vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result != ES_OK)
- goto finish;
-
- switch (exit_code) {
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(boot_ghcb, &ctxt);
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(boot_ghcb, &ctxt);
- break;
- default:
- result = ES_UNSUPPORTED;
- break;
- }
-
-finish:
- if (result == ES_OK) {
- vc_finish_insn(&ctxt);
- } else if (result != ES_RETRY) {
- /*
- * For now, just halt the machine. That makes debugging easier,
- * later we just call sev_es_terminate() here.
- */
- while (true)
- asm volatile("hlt\n");
- }
-}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
new file mode 100644
index 000000000000..52f989f6acc2
--- /dev/null
+++ b/arch/x86/boot/compressed/sev.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+/*
+ * misc.h needs to be first because it knows how to include the other kernel
+ * headers in the pre-decompression code in a way that does not break
+ * compilation.
+ */
+#include "misc.h"
+
+#include <asm/pgtable_types.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/msr-index.h>
+#include <asm/fpu/xcr.h>
+#include <asm/ptrace.h>
+#include <asm/svm.h>
+#include <asm/cpuid.h>
+
+#include "error.h"
+#include "../msr.h"
+
+struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+struct ghcb *boot_ghcb;
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+static bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+ int i;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ struct msr m;
+
+ boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+
+ return m.q;
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ struct msr m;
+
+ m.q = val;
+ boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+}
+
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+
+ return ES_OK;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+#undef __init
+#undef __pa
+#define __init
+#define __pa(x) ((unsigned long)(x))
+
+#define __BOOT_COMPRESSED
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/* Include code for early handlers */
+#include "../../kernel/sev-shared.c"
+
+static inline bool sev_snp_enabled(void)
+{
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+}
+
+static void __page_state_change(unsigned long paddr, enum psc_op op)
+{
+ u64 val;
+
+ if (!sev_snp_enabled())
+ return;
+
+ /*
+ * If private -> shared then invalidate the page before requesting the
+ * state change in the RMP table.
+ */
+ if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ /* Issue VMGEXIT to change the page state in RMP table. */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ /* Read the response of the VMGEXIT. */
+ val = sev_es_rd_ghcb_msr();
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ /*
+ * Now that page state is changed in the RMP table, validate it so that it is
+ * consistent with the RMP entry.
+ */
+ if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+}
+
+void snp_set_page_private(unsigned long paddr)
+{
+ __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE);
+}
+
+void snp_set_page_shared(unsigned long paddr)
+{
+ __page_state_change(paddr, SNP_PAGE_STATE_SHARED);
+}
+
+static bool early_setup_ghcb(void)
+{
+ if (set_page_decrypted((unsigned long)&boot_ghcb_page))
+ return false;
+
+ /* Page is now mapped decrypted, clear it */
+ memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
+
+ boot_ghcb = &boot_ghcb_page;
+
+ /* Initialize lookup tables for the instruction decoder */
+ inat_init_tables();
+
+ /* SNP guest requires the GHCB GPA must be registered */
+ if (sev_snp_enabled())
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+
+ return true;
+}
+
+void sev_es_shutdown_ghcb(void)
+{
+ if (!boot_ghcb)
+ return;
+
+ if (!sev_es_check_cpu_features())
+ error("SEV-ES CPU Features missing.");
+
+ /*
+ * GHCB Page must be flushed from the cache and mapped encrypted again.
+ * Otherwise the running kernel will see strange cache effects when
+ * trying to use that page.
+ */
+ if (set_page_encrypted((unsigned long)&boot_ghcb_page))
+ error("Can't map GHCB page encrypted");
+
+ /*
+ * GHCB page is mapped encrypted again and flushed from the cache.
+ * Mark it non-present now to catch bugs when #VC exceptions trigger
+ * after this point.
+ */
+ if (set_page_non_present((unsigned long)&boot_ghcb_page))
+ error("Can't unmap GHCB page");
+}
+
+bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ /* Check whether the fault was on the GHCB page */
+ return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
+}
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK)
+ vc_finish_insn(&ctxt);
+ else if (result != ES_RETRY)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+static void enforce_vmpl0(void)
+{
+ u64 attrs;
+ int err;
+
+ /*
+ * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
+ * higher) privilege level. Here, clear the VMPL1 permission mask of the
+ * GHCB page. If the guest is not running at VMPL0, this will fail.
+ *
+ * If the guest is running at VMPL0, it will succeed. Even if that operation
+ * modifies permission bits, it is still ok to do so currently because Linux
+ * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks
+ * changing is a don't-care.
+ */
+ attrs = 1;
+ if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
+}
+
+void sev_enable(struct boot_params *bp)
+{
+ unsigned int eax, ebx, ecx, edx;
+ struct msr m;
+ bool snp;
+
+ /*
+ * Setup/preliminary detection of SNP. This will be sanity-checked
+ * against CPUID/MSR values later.
+ */
+ snp = snp_init(bp);
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV is supported */
+ if (!(eax & BIT(1))) {
+ if (snp)
+ error("SEV-SNP support indicated by CC blob, but not CPUID.");
+ return;
+ }
+
+ /* Set the SME mask if this is an SEV guest. */
+ boot_rdmsr(MSR_AMD64_SEV, &m);
+ sev_status = m.q;
+ if (!(sev_status & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* Negotiate the GHCB protocol version. */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) {
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED);
+ }
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ if (!(get_hv_features() & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ enforce_vmpl0();
+ }
+
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
+
+ sme_me_mask = BIT_ULL(ebx & 0x3f);
+}
+
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+bool snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void sev_prep_identity_maps(unsigned long top_level_pgt)
+{
+ /*
+ * The Confidential Computing blob is used very early in uncompressed
+ * kernel to find the in-memory CPUID table to handle CPUID
+ * instructions. Make sure an identity-mapping exists so it can be
+ * accessed after switchover.
+ */
+ if (sev_snp_enabled()) {
+ unsigned long cc_info_pa = boot_params->cc_blob_address;
+ struct cc_blob_sev_info *cc_info;
+
+ kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
+
+ cc_info = (struct cc_blob_sev_info *)cc_info_pa;
+ kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len);
+ }
+
+ sev_verify_cbit(top_level_pgt);
+}
diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S
new file mode 100644
index 000000000000..46d0495e0d3a
--- /dev/null
+++ b/arch/x86/boot/compressed/tdcall.S
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "../../coco/tdx/tdcall.S"
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
new file mode 100644
index 000000000000..918a7606f53c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../cpuflags.h"
+#include "../string.h"
+#include "../io.h"
+#include "error.h"
+
+#include <vdso/limits.h>
+#include <uapi/asm/vmx.h>
+
+#include <asm/shared/tdx.h>
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ error("TDVMCALL failed. TDX module bug?");
+}
+
+static inline unsigned int tdx_io_in(int size, u16 port)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_IO_INSTRUCTION,
+ .r12 = size,
+ .r13 = 0,
+ .r14 = port,
+ };
+
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return UINT_MAX;
+
+ return args.r11;
+}
+
+static inline void tdx_io_out(int size, u16 port, u32 value)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_IO_INSTRUCTION,
+ .r12 = size,
+ .r13 = 1,
+ .r14 = port,
+ .r15 = value,
+ };
+
+ __tdx_hypercall(&args, 0);
+}
+
+static inline u8 tdx_inb(u16 port)
+{
+ return tdx_io_in(1, port);
+}
+
+static inline void tdx_outb(u8 value, u16 port)
+{
+ tdx_io_out(1, port, value);
+}
+
+static inline void tdx_outw(u16 value, u16 port)
+{
+ tdx_io_out(2, port, value);
+}
+
+void early_tdx_detect(void)
+{
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ /* Use hypercalls instead of I/O instructions */
+ pio_ops.f_inb = tdx_inb;
+ pio_ops.f_outb = tdx_outb;
+ pio_ops.f_outw = tdx_outw;
+}
diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h
new file mode 100644
index 000000000000..9055482cd35c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_TDX_H
+#define BOOT_COMPRESSED_TDX_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+void early_tdx_detect(void);
+#else
+static inline void early_tdx_detect(void) { };
+#endif
+
+#endif /* BOOT_COMPRESSED_TDX_H */
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index e1478d32de1a..fed8d13ce252 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -27,6 +27,7 @@
#include <asm/required-features.h>
#include <asm/msr-index.h>
#include "string.h"
+#include "msr.h"
static u32 err_flags[NCAPINTS];
@@ -130,12 +131,11 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
/* If this is an AMD and we're only missing SSE+SSE2, try to
turn them on */
- u32 ecx = MSR_K7_HWCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax &= ~(1 << 15);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_rdmsr(MSR_K7_HWCR, &m);
+ m.l &= ~(1 << 15);
+ boot_wrmsr(MSR_K7_HWCR, &m);
get_cpuflags(); /* Make sure it really did something */
err = check_cpuflags();
@@ -145,28 +145,28 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
/* If this is a VIA C3, we might have to enable CX8
explicitly */
- u32 ecx = MSR_VIA_FCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax |= (1<<1)|(1<<7);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_rdmsr(MSR_VIA_FCR, &m);
+ m.l |= (1 << 1) | (1 << 7);
+ boot_wrmsr(MSR_VIA_FCR, &m);
set_bit(X86_FEATURE_CX8, cpu.flags);
err = check_cpuflags();
} else if (err == 0x01 && is_transmeta()) {
/* Transmeta might have masked feature bits in word 0 */
- u32 ecx = 0x80860004;
- u32 eax, edx;
+ struct msr m, m_tmp;
u32 level = 1;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
+ boot_rdmsr(0x80860004, &m);
+ m_tmp = m;
+ m_tmp.l = ~0;
+ boot_wrmsr(0x80860004, &m_tmp);
asm("cpuid"
: "+a" (level), "=d" (cpu.flags[0])
: : "ecx", "ebx");
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_wrmsr(0x80860004, &m);
err = check_cpuflags();
} else if (err == 0x01 &&
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a0b75f73dc63..a83d67ec627d 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -71,8 +71,7 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif
-static inline void cpuid_count(u32 id, u32 count,
- u32 *a, u32 *b, u32 *c, u32 *d)
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 2e20814d3ce3..475b8fde90f7 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -17,5 +17,6 @@ extern u32 cpu_vendor[3];
int has_eflag(unsigned long mask);
void get_cpuflags(void);
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
#endif
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index 6a10d52a4145..c9299aeb7333 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
@@ -8,15 +8,24 @@
#
# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
#
-# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture
+# "make fdimage/fdimage144/fdimage288/hdimage/isoimage"
+# script for x86 architecture
#
# Arguments:
-# $1 - fdimage format
-# $2 - target image file
-# $3 - kernel bzImage file
-# $4 - mtool configuration file
-# $5 - kernel cmdline
-# $6 - inird image file
+# $1 - fdimage format
+# $2 - target image file
+# $3 - kernel bzImage file
+# $4 - mtools configuration file
+# $5 - kernel cmdline
+# $6+ - initrd image file(s)
+#
+# This script requires:
+# bash
+# syslinux
+# mtools (for fdimage* and hdimage)
+# edk2/OVMF (for hdimage)
+#
+# Otherwise try to stick to POSIX shell commands...
#
# Use "make V=1" to debug this script
@@ -26,105 +35,238 @@ case "${KBUILD_VERBOSE}" in
;;
esac
-verify () {
- if [ ! -f "$1" ]; then
- echo "" 1>&2
- echo " *** Missing file: $1" 1>&2
- echo "" 1>&2
- exit 1
+# Exit the top-level shell with an error
+topshell=$$
+trap 'exit 1' USR1
+die() {
+ echo "" 1>&2
+ echo " *** $*" 1>&2
+ echo "" 1>&2
+ kill -USR1 $topshell
+}
+
+# Verify the existence and readability of a file
+verify() {
+ if [ ! -f "$1" -o ! -r "$1" ]; then
+ die "Missing file: $1"
fi
}
+diskfmt="$1"
+FIMAGE="$2"
+FBZIMAGE="$3"
+MTOOLSRC="$4"
+KCMDLINE="$5"
+shift 5 # Remaining arguments = initrd files
+
+export MTOOLSRC
-export MTOOLSRC=$4
-FIMAGE=$2
-FBZIMAGE=$3
-KCMDLINE=$5
-FDINITRD=$6
+# common options for dd
+dd='dd iflag=fullblock'
# Make sure the files actually exist
verify "$FBZIMAGE"
-genbzdisk() {
- verify "$MTOOLSRC"
- mformat a:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - a:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" a:initrd.img
+declare -a FDINITRDS
+irdpfx=' initrd='
+initrdopts_syslinux=''
+initrdopts_efi=''
+for f in "$@"; do
+ if [ -f "$f" -a -r "$f" ]; then
+ FDINITRDS=("${FDINITRDS[@]}" "$f")
+ fname="$(basename "$f")"
+ initrdopts_syslinux="${initrdopts_syslinux}${irdpfx}${fname}"
+ irdpfx=,
+ initrdopts_efi="${initrdopts_efi} initrd=${fname}"
fi
- mcopy $FBZIMAGE a:linux
+done
+
+# Read a $3-byte littleendian unsigned value at offset $2 from file $1
+le() {
+ local n=0
+ local m=1
+ for b in $(od -A n -v -j $2 -N $3 -t u1 "$1"); do
+ n=$((n + b*m))
+ m=$((m * 256))
+ done
+ echo $n
}
-genfdimage144() {
- verify "$MTOOLSRC"
- dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
- mformat v:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - v:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" v:initrd.img
- fi
- mcopy $FBZIMAGE v:linux
+# Get the EFI architecture name such that boot{name}.efi is the default
+# boot file name. Returns false with no output if the file is not an
+# EFI image or otherwise unknown.
+efiarch() {
+ [ -f "$1" ] || return
+ [ $(le "$1" 0 2) -eq 23117 ] || return # MZ magic
+ peoffs=$(le "$1" 60 4) # PE header offset
+ [ $peoffs -ge 64 ] || return
+ [ $(le "$1" $peoffs 4) -eq 17744 ] || return # PE magic
+ case $(le "$1" $((peoffs+4+20)) 2) in # PE type
+ 267) ;; # PE32
+ 523) ;; # PE32+
+ *) return 1 ;; # Invalid
+ esac
+ [ $(le "$1" $((peoffs+4+20+68)) 2) -eq 10 ] || return # EFI app
+ case $(le "$1" $((peoffs+4)) 2) in # Machine type
+ 332) echo i386 ;;
+ 450) echo arm ;;
+ 512) echo ia64 ;;
+ 20530) echo riscv32 ;;
+ 20580) echo riscv64 ;;
+ 20776) echo riscv128 ;;
+ 34404) echo x64 ;;
+ 43620) echo aa64 ;;
+ esac
}
-genfdimage288() {
- verify "$MTOOLSRC"
- dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
- mformat w:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - W:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" w:initrd.img
- fi
- mcopy $FBZIMAGE w:linux
+# Get the combined sizes in bytes of the files given, counting sparse
+# files as full length, and padding each file to cluster size
+cluster=16384
+filesizes() {
+ local t=0
+ local s
+ for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do
+ t=$((t + ((s+cluster-1)/cluster)*cluster))
+ done
+ echo $t
}
-geniso() {
- tmp_dir=`dirname $FIMAGE`/isoimage
- rm -rf $tmp_dir
- mkdir $tmp_dir
- for i in lib lib64 share ; do
- for j in syslinux ISOLINUX ; do
- if [ -f /usr/$i/$j/isolinux.bin ] ; then
- isolinux=/usr/$i/$j/isolinux.bin
- fi
+# Expand directory names which should be in /usr/share into a list
+# of possible alternatives
+sharedirs() {
+ local dir file
+ for dir in /usr/share /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
done
- for j in syslinux syslinux/modules/bios ; do
- if [ -f /usr/$i/$j/ldlinux.c32 ]; then
- ldlinux=/usr/$i/$j/ldlinux.c32
- fi
+ done
+}
+efidirs() {
+ local dir file
+ for dir in /usr/share /boot /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
done
- if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
- break
+ done
+}
+
+findsyslinux() {
+ local f="$(find -L $(sharedirs syslinux isolinux) \
+ -name "$1" -readable -type f -print -quit 2>/dev/null)"
+ if [ ! -f "$f" ]; then
+ die "Need a $1 file, please install syslinux/isolinux."
+ fi
+ echo "$f"
+ return 0
+}
+
+findovmf() {
+ local arch="$1"
+ shift
+ local -a names=(-false)
+ local name f
+ for name; do
+ names=("${names[@]}" -or -iname "$name")
+ done
+ for f in $(find -L $(efidirs edk2 ovmf) \
+ \( "${names[@]}" \) -readable -type f \
+ -print 2>/dev/null); do
+ if [ "$(efiarch "$f")" = "$arch" ]; then
+ echo "$f"
+ return 0
fi
done
- if [ -z "$isolinux" ] ; then
- echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
- exit 1
+ die "Need a $1 file for $arch, please install EDK2/OVMF."
+}
+
+do_mcopy() {
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ mcopy "${FDINITRDS[@]}" "$1"
+ fi
+ if [ -n "$efishell" ]; then
+ mmd "$1"EFI "$1"EFI/Boot
+ mcopy "$efishell" "$1"EFI/Boot/boot${kefiarch}.efi
fi
- if [ -z "$ldlinux" ] ; then
- echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
- exit 1
+ if [ -n "$kefiarch" ]; then
+ echo linux "$KCMDLINE$initrdopts_efi" | \
+ mcopy - "$1"startup.nsh
fi
- cp $isolinux $tmp_dir
- cp $ldlinux $tmp_dir
- cp $FBZIMAGE $tmp_dir/linux
- echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
- if [ -f "$FDINITRD" ] ; then
- cp "$FDINITRD" $tmp_dir/initrd.img
+ echo default linux "$KCMDLINE$initrdopts_syslinux" | \
+ mcopy - "$1"syslinux.cfg
+ mcopy "$FBZIMAGE" "$1"linux
+}
+
+genbzdisk() {
+ verify "$MTOOLSRC"
+ mformat -v 'LINUX_BOOT' a:
+ syslinux "$FIMAGE"
+ do_mcopy a:
+}
+
+genfdimage144() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=1440 2>/dev/null
+ mformat -v 'LINUX_BOOT' v:
+ syslinux "$FIMAGE"
+ do_mcopy v:
+}
+
+genfdimage288() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=2880 2>/dev/null
+ mformat -v 'LINUX_BOOT' w:
+ syslinux "$FIMAGE"
+ do_mcopy w:
+}
+
+genhdimage() {
+ verify "$MTOOLSRC"
+ mbr="$(findsyslinux mbr.bin)"
+ kefiarch="$(efiarch "$FBZIMAGE")"
+ if [ -n "$kefiarch" ]; then
+ # The efishell provides command line handling
+ efishell="$(findovmf $kefiarch shell.efi shell${kefiarch}.efi)"
+ ptype='-T 0xef' # EFI system partition, no GPT
fi
- genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
- -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
- -boot-info-table $tmp_dir
- isohybrid $FIMAGE 2>/dev/null || true
- rm -rf $tmp_dir
+ sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell")
+ # Allow 1% + 2 MiB for filesystem and partition table overhead,
+ # syslinux, and config files; this is probably excessive...
+ megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024)))
+ $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null
+ mpartition -I -c -s 32 -h 64 $ptype -b 64 -a p:
+ $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null
+ mformat -v 'LINUX_BOOT' -s 32 -h 64 -c $((cluster/512)) -t $megs h:
+ syslinux --offset $((64*512)) "$FIMAGE"
+ do_mcopy h:
+}
+
+geniso() {
+ tmp_dir="$(dirname "$FIMAGE")/isoimage"
+ rm -rf "$tmp_dir"
+ mkdir "$tmp_dir"
+ isolinux=$(findsyslinux isolinux.bin)
+ ldlinux=$(findsyslinux ldlinux.c32)
+ cp "$isolinux" "$ldlinux" "$tmp_dir"
+ cp "$FBZIMAGE" "$tmp_dir"/linux
+ echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
+ -quiet -o "$FIMAGE" -b isolinux.bin \
+ -c boot.cat -no-emul-boot -boot-load-size 4 \
+ -boot-info-table "$tmp_dir"
+ isohybrid "$FIMAGE" 2>/dev/null || true
+ rm -rf "$tmp_dir"
}
-case $1 in
+rm -f "$FIMAGE"
+
+case "$diskfmt" in
bzdisk) genbzdisk;;
fdimage144) genfdimage144;;
fdimage288) genfdimage288;;
+ hdimage) genhdimage;;
isoimage) geniso;;
- *) echo 'Unknown image format'; exit 1;
+ *) die "Unknown image format: $diskfmt";;
esac
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 6dbd7e9f74c9..f912d7770130 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -163,7 +163,11 @@ extra_header_fields:
.long 0x200 # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
+#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES
+ .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
+#else
.word 0 # DllCharacteristics
+#endif
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index d13ec1c38640..0849f4b42745 100644..100755
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -15,28 +15,6 @@
# $2 - kernel image file
# $3 - kernel map file
# $4 - default install path (blank if root directory)
-#
-
-verify () {
- if [ ! -f "$1" ]; then
- echo "" 1>&2
- echo " *** Missing file: $1" 1>&2
- echo ' *** You need to run "make" before "make install".' 1>&2
- echo "" 1>&2
- exit 1
- fi
-}
-
-# Make sure the files actually exist
-verify "$2"
-verify "$3"
-
-# User may have a custom install script
-
-if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
-if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
-
-# Default install - same as make zlilo
if [ -f $4/vmlinuz ]; then
mv $4/vmlinuz $4/vmlinuz.old
diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h
new file mode 100644
index 000000000000..110880907f87
--- /dev/null
+++ b/arch/x86/boot/io.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_IO_H
+#define BOOT_IO_H
+
+#include <asm/shared/io.h>
+
+#undef inb
+#undef inw
+#undef inl
+#undef outb
+#undef outw
+#undef outl
+
+struct port_io_ops {
+ u8 (*f_inb)(u16 port);
+ void (*f_outb)(u8 v, u16 port);
+ void (*f_outw)(u16 v, u16 port);
+};
+
+extern struct port_io_ops pio_ops;
+
+/*
+ * Use the normal I/O instructions by default.
+ * TDX guests override these to use hypercalls.
+ */
+static inline void init_default_io_ops(void)
+{
+ pio_ops.f_inb = __inb;
+ pio_ops.f_outb = __outb;
+ pio_ops.f_outw = __outw;
+}
+
+/*
+ * Redirect port I/O operations via pio_ops callbacks.
+ * TDX guests override these callbacks with TDX-specific helpers.
+ */
+#define inb pio_ops.f_inb
+#define outb pio_ops.f_outb
+#define outw pio_ops.f_outw
+
+#endif
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index e3add857c2c9..c4ea5258ab55 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -17,6 +17,8 @@
struct boot_params boot_params __attribute__((aligned(16)));
+struct port_io_ops pio_ops;
+
char *HEAP = _end;
char *heap_end = _end; /* Default end of heap = no heap */
@@ -33,7 +35,7 @@ static void copy_boot_params(void)
u16 cl_offset;
};
const struct old_cmdline * const oldcmd =
- (const struct old_cmdline *)OLD_CL_ADDRESS;
+ absolute_pointer(OLD_CL_ADDRESS);
BUILD_BUG_ON(sizeof(boot_params) != 4096);
memcpy(&boot_params.hdr, &hdr, sizeof(hdr));
@@ -133,6 +135,8 @@ static void init_heap(void)
void main(void)
{
+ init_default_io_ops();
+
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h
new file mode 100644
index 000000000000..aed66f7ae199
--- /dev/null
+++ b/arch/x86/boot/msr.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Helpers/definitions related to MSR access.
+ */
+
+#ifndef BOOT_MSR_H
+#define BOOT_MSR_H
+
+#include <asm/shared/msr.h>
+
+/*
+ * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
+ * boot kernel since they rely on tracepoint/exception handling infrastructure
+ * that's not available here.
+ */
+static inline void boot_rdmsr(unsigned int reg, struct msr *m)
+{
+ asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
+}
+
+static inline void boot_wrmsr(unsigned int reg, const struct msr *m)
+{
+ asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
+}
+
+#endif /* BOOT_MSR_H */
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
index efd6d2490c1d..174c60508766 100644
--- a/arch/x86/boot/mtools.conf.in
+++ b/arch/x86/boot/mtools.conf.in
@@ -14,4 +14,8 @@ drive v:
drive w:
file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
-
+# Hard disk (h: for the filesystem, p: for format - old mtools bug?)
+drive h:
+ file="@OBJ@/hdimage" offset=32768 mformat_only
+drive p:
+ file="@OBJ@/hdimage" partition=1 mformat_only
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index a232da487cd2..e5d2c6b8c2f1 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -8,8 +8,10 @@
#undef memcmp
void *memcpy(void *dst, const void *src, size_t len);
+void *memmove(void *dst, const void *src, size_t len);
void *memset(void *dst, int c, size_t len);
int memcmp(const void *s1, const void *s2, size_t len);
+int bcmp(const void *s1, const void *s2, size_t len);
/* Access builtin version by default. */
#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
@@ -25,6 +27,7 @@ extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
unsigned int base);
+long simple_strtol(const char *cp, char **endp, unsigned int base);
int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 7e185977a984..c2c6d35e3a43 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -83,7 +83,7 @@ static int vesa_probe(void)
(vminfo.memory_layout == 4 ||
vminfo.memory_layout == 6) &&
vminfo.memory_planes == 1) {
-#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+#ifdef CONFIG_BOOT_VESA_SUPPORT
/* Graphics mode, color, linear frame buffer
supported. Only register the mode if
if framebuffer is configured, however,
@@ -121,7 +121,7 @@ static int vesa_set_mode(struct mode_info *mode)
if ((vminfo.mode_attr & 0x15) == 0x05) {
/* It's a supported text mode */
is_graphic = 0;
-#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+#ifdef CONFIG_BOOT_VESA_SUPPORT
} else if ((vminfo.mode_attr & 0x99) == 0x99) {
/* It's a graphics mode with linear frame buffer */
is_graphic = 1;
diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile
new file mode 100644
index 000000000000..c816acf78b6a
--- /dev/null
+++ b/arch/x86/coco/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS_REMOVE_core.o = -pg
+KASAN_SANITIZE_core.o := n
+CFLAGS_core.o += -fno-stack-protector
+
+obj-y += core.o
+
+obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
new file mode 100644
index 000000000000..49b44f881484
--- /dev/null
+++ b/arch/x86/coco/core.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/export.h>
+#include <linux/cc_platform.h>
+
+#include <asm/coco.h>
+#include <asm/processor.h>
+
+static enum cc_vendor vendor __ro_after_init;
+static u64 cc_mask __ro_after_init;
+
+static bool intel_cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ case CC_ATTR_HOTPLUG_DISABLED:
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this. When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return sme_me_mask;
+
+ case CC_ATTR_HOST_MEM_ENCRYPT:
+ return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+
+ case CC_ATTR_GUEST_STATE_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
+ case CC_ATTR_GUEST_SEV_SNP:
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+
+ default:
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+static bool hyperv_cc_platform_has(enum cc_attr attr)
+{
+ return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
+}
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ switch (vendor) {
+ case CC_VENDOR_AMD:
+ return amd_cc_platform_has(attr);
+ case CC_VENDOR_INTEL:
+ return intel_cc_platform_has(attr);
+ case CC_VENDOR_HYPERV:
+ return hyperv_cc_platform_has(attr);
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
+u64 cc_mkenc(u64 val)
+{
+ /*
+ * Both AMD and Intel use a bit in the page table to indicate
+ * encryption status of the page.
+ *
+ * - for AMD, bit *set* means the page is encrypted
+ * - for Intel *clear* means encrypted.
+ */
+ switch (vendor) {
+ case CC_VENDOR_AMD:
+ return val | cc_mask;
+ case CC_VENDOR_INTEL:
+ return val & ~cc_mask;
+ default:
+ return val;
+ }
+}
+
+u64 cc_mkdec(u64 val)
+{
+ /* See comment in cc_mkenc() */
+ switch (vendor) {
+ case CC_VENDOR_AMD:
+ return val & ~cc_mask;
+ case CC_VENDOR_INTEL:
+ return val | cc_mask;
+ default:
+ return val;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_mkdec);
+
+__init void cc_set_vendor(enum cc_vendor v)
+{
+ vendor = v;
+}
+
+__init void cc_set_mask(u64 mask)
+{
+ cc_mask = mask;
+}
diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile
new file mode 100644
index 000000000000..46c55998557d
--- /dev/null
+++ b/arch/x86/coco/tdx/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += tdx.o tdcall.o
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
new file mode 100644
index 000000000000..f9eb1134f22d
--- /dev/null
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/asm-offsets.h>
+#include <asm/asm.h>
+#include <asm/frame.h>
+#include <asm/unwind_hints.h>
+
+#include <linux/linkage.h>
+#include <linux/bits.h>
+#include <linux/errno.h>
+
+#include "../../virt/vmx/tdx/tdxcall.S"
+
+/*
+ * Bitmasks of exposed registers (with VMM).
+ */
+#define TDX_R10 BIT(10)
+#define TDX_R11 BIT(11)
+#define TDX_R12 BIT(12)
+#define TDX_R13 BIT(13)
+#define TDX_R14 BIT(14)
+#define TDX_R15 BIT(15)
+
+/*
+ * These registers are clobbered to hold arguments for each
+ * TDVMCALL. They are safe to expose to the VMM.
+ * Each bit in this mask represents a register ID. Bit field
+ * details can be found in TDX GHCI specification, section
+ * titled "TDCALL [TDG.VP.VMCALL] leaf".
+ */
+#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \
+ TDX_R12 | TDX_R13 | \
+ TDX_R14 | TDX_R15 )
+
+/*
+ * __tdx_module_call() - Used by TDX guests to request services from
+ * the TDX module (does not include VMM services) using TDCALL instruction.
+ *
+ * Transforms function call register arguments into the TDCALL register ABI.
+ * After TDCALL operation, TDX module output is saved in @out (if it is
+ * provided by the user).
+ *
+ *-------------------------------------------------------------------------
+ * TDCALL ABI:
+ *-------------------------------------------------------------------------
+ * Input Registers:
+ *
+ * RAX - TDCALL Leaf number.
+ * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers.
+ *
+ * Output Registers:
+ *
+ * RAX - TDCALL instruction error code.
+ * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __tdx_module_call() function ABI:
+ *
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @rcx (RSI) - Input parameter 1, moved to RCX
+ * @rdx (RDX) - Input parameter 2, moved to RDX
+ * @r8 (RCX) - Input parameter 3, moved to R8
+ * @r9 (R8) - Input parameter 4, moved to R9
+ *
+ * @out (R9) - struct tdx_module_output pointer
+ * stored temporarily in R12 (not
+ * shared with the TDX module). It
+ * can be NULL.
+ *
+ * Return status of TDCALL via RAX.
+ */
+SYM_FUNC_START(__tdx_module_call)
+ FRAME_BEGIN
+ TDX_MODULE_CALL host=0
+ FRAME_END
+ RET
+SYM_FUNC_END(__tdx_module_call)
+
+/*
+ * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf
+ * of TDCALL instruction
+ *
+ * Transforms values in function call argument struct tdx_hypercall_args @args
+ * into the TDCALL register ABI. After TDCALL operation, VMM output is saved
+ * back in @args.
+ *
+ *-------------------------------------------------------------------------
+ * TD VMCALL ABI:
+ *-------------------------------------------------------------------------
+ *
+ * Input Registers:
+ *
+ * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL)
+ * RCX - BITMAP which controls which part of TD Guest GPR
+ * is passed as-is to the VMM and back.
+ * R10 - Set 0 to indicate TDCALL follows standard TDX ABI
+ * specification. Non zero value indicates vendor
+ * specific ABI.
+ * R11 - VMCALL sub function number
+ * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments.
+ * R8-R9, R12-R15 - Same as above.
+ *
+ * Output Registers:
+ *
+ * RAX - TDCALL instruction status (Not related to hypercall
+ * output).
+ * R10 - Hypercall output error code.
+ * R11-R15 - Hypercall sub function specific output values.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __tdx_hypercall() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input and output
+ * @flags (RSI) - TDX_HCALL_* flags
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall)
+ FRAME_BEGIN
+
+ /* Save callee-saved GPRs as mandated by the x86_64 ABI */
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+
+ /* Mangle function call ABI into TDCALL ABI: */
+ /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
+ xor %eax, %eax
+
+ /* Copy hypercall registers from arg struct: */
+ movq TDX_HYPERCALL_r10(%rdi), %r10
+ movq TDX_HYPERCALL_r11(%rdi), %r11
+ movq TDX_HYPERCALL_r12(%rdi), %r12
+ movq TDX_HYPERCALL_r13(%rdi), %r13
+ movq TDX_HYPERCALL_r14(%rdi), %r14
+ movq TDX_HYPERCALL_r15(%rdi), %r15
+
+ movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
+
+ /*
+ * For the idle loop STI needs to be called directly before the TDCALL
+ * that enters idle (EXIT_REASON_HLT case). STI instruction enables
+ * interrupts only one instruction later. If there is a window between
+ * STI and the instruction that emulates the HALT state, there is a
+ * chance for interrupts to happen in this window, which can delay the
+ * HLT operation indefinitely. Since this is the not the desired
+ * result, conditionally call STI before TDCALL.
+ */
+ testq $TDX_HCALL_ISSUE_STI, %rsi
+ jz .Lskip_sti
+ sti
+.Lskip_sti:
+ tdcall
+
+ /*
+ * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that
+ * something has gone horribly wrong with the TDX module.
+ *
+ * The return status of the hypercall operation is in a separate
+ * register (in R10). Hypercall errors are a part of normal operation
+ * and are handled by callers.
+ */
+ testq %rax, %rax
+ jne .Lpanic
+
+ /* TDVMCALL leaf return code is in R10 */
+ movq %r10, %rax
+
+ /* Copy hypercall result registers to arg struct if needed */
+ testq $TDX_HCALL_HAS_OUTPUT, %rsi
+ jz .Lout
+
+ movq %r10, TDX_HYPERCALL_r10(%rdi)
+ movq %r11, TDX_HYPERCALL_r11(%rdi)
+ movq %r12, TDX_HYPERCALL_r12(%rdi)
+ movq %r13, TDX_HYPERCALL_r13(%rdi)
+ movq %r14, TDX_HYPERCALL_r14(%rdi)
+ movq %r15, TDX_HYPERCALL_r15(%rdi)
+.Lout:
+ /*
+ * Zero out registers exposed to the VMM to avoid speculative execution
+ * with VMM-controlled values. This needs to include all registers
+ * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
+ * context will be restored.
+ */
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+
+ /* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ FRAME_END
+
+ RET
+.Lpanic:
+ call __tdx_hypercall_failed
+ /* __tdx_hypercall_failed never returns */
+ REACHABLE
+ jmp .Lpanic
+SYM_FUNC_END(__tdx_hypercall)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
new file mode 100644
index 000000000000..928dcf7a20d9
--- /dev/null
+++ b/arch/x86/coco/tdx/tdx.c
@@ -0,0 +1,777 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021-2022 Intel Corporation */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tdx: " fmt
+
+#include <linux/cpufeature.h>
+#include <asm/coco.h>
+#include <asm/tdx.h>
+#include <asm/vmx.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/pgtable.h>
+
+/* TDX module Call Leaf IDs */
+#define TDX_GET_INFO 1
+#define TDX_GET_VEINFO 3
+#define TDX_ACCEPT_PAGE 6
+
+/* TDX hypercall Leaf IDs */
+#define TDVMCALL_MAP_GPA 0x10001
+
+/* MMIO direction */
+#define EPT_READ 0
+#define EPT_WRITE 1
+
+/* Port I/O direction */
+#define PORT_READ 0
+#define PORT_WRITE 1
+
+/* See Exit Qualification for I/O Instructions in VMX documentation */
+#define VE_IS_IO_IN(e) ((e) & BIT(3))
+#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
+#define VE_GET_PORT_NUM(e) ((e) >> 16)
+#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+
+/*
+ * Wrapper for standard use of __tdx_hypercall with no output aside from
+ * return code.
+ */
+static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = fn,
+ .r12 = r12,
+ .r13 = r13,
+ .r14 = r14,
+ .r15 = r15,
+ };
+
+ return __tdx_hypercall(&args, 0);
+}
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ panic("TDVMCALL failed. TDX module bug?");
+}
+
+/*
+ * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
+ * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
+ * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
+ * guest sides of these calls.
+ */
+static u64 hcall_func(u64 exit_reason)
+{
+ return exit_reason;
+}
+
+#ifdef CONFIG_KVM_GUEST
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = nr,
+ .r11 = p1,
+ .r12 = p2,
+ .r13 = p3,
+ .r14 = p4,
+ };
+
+ return __tdx_hypercall(&args, 0);
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
+#endif
+
+/*
+ * Used for TDX guests to make calls directly to the TD module. This
+ * should only be used for calls that have no legitimate reason to fail
+ * or where the kernel can not survive the call failing.
+ */
+static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+ struct tdx_module_output *out)
+{
+ if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
+ panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+}
+
+static u64 get_cc_mask(void)
+{
+ struct tdx_module_output out;
+ unsigned int gpa_width;
+
+ /*
+ * TDINFO TDX module call is used to get the TD execution environment
+ * information like GPA width, number of available vcpus, debug mode
+ * information, etc. More details about the ABI can be found in TDX
+ * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
+ * [TDG.VP.INFO].
+ *
+ * The GPA width that comes out of this call is critical. TDX guests
+ * can not meaningfully run without it.
+ */
+ tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+
+ gpa_width = out.rcx & GENMASK(5, 0);
+
+ /*
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
+ */
+ return BIT_ULL(gpa_width - 1);
+}
+
+/*
+ * The TDX module spec states that #VE may be injected for a limited set of
+ * reasons:
+ *
+ * - Emulation of the architectural #VE injection on EPT violation;
+ *
+ * - As a result of guest TD execution of a disallowed instruction,
+ * a disallowed MSR access, or CPUID virtualization;
+ *
+ * - A notification to the guest TD about anomalous behavior;
+ *
+ * The last one is opt-in and is not used by the kernel.
+ *
+ * The Intel Software Developer's Manual describes cases when instruction
+ * length field can be used in section "Information for VM Exits Due to
+ * Instruction Execution".
+ *
+ * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
+ * information if #VE occurred due to instruction execution, but not for EPT
+ * violations.
+ */
+static int ve_instr_len(struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_IO_INSTRUCTION:
+ /* It is safe to use ve->instr_len for #VE due instructions */
+ return ve->instr_len;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * For EPT violations, ve->insn_len is not defined. For those,
+ * the kernel must decode instructions manually and should not
+ * be using this function.
+ */
+ WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
+ return 0;
+ default:
+ WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
+ return ve->instr_len;
+ }
+}
+
+static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_HLT),
+ .r12 = irq_disabled,
+ };
+
+ /*
+ * Emulate HLT operation via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
+ *
+ * The VMM uses the "IRQ disabled" param to understand IRQ
+ * enabled status (RFLAGS.IF) of the TD guest and to determine
+ * whether or not it should schedule the halted vCPU if an
+ * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
+ * can keep the vCPU in virtual HLT, even if an IRQ is
+ * pending, without hanging/breaking the guest.
+ */
+ return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
+}
+
+static int handle_halt(struct ve_info *ve)
+{
+ /*
+ * Since non safe halt is mainly used in CPU offlining
+ * and the guest will always stay in the halt state, don't
+ * call the STI instruction (set do_sti as false).
+ */
+ const bool irq_disabled = irqs_disabled();
+ const bool do_sti = false;
+
+ if (__halt(irq_disabled, do_sti))
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+void __cpuidle tdx_safe_halt(void)
+{
+ /*
+ * For do_sti=true case, __tdx_hypercall() function enables
+ * interrupts using the STI instruction before the TDCALL. So
+ * set irq_disabled as false.
+ */
+ const bool irq_disabled = false;
+ const bool do_sti = true;
+
+ /*
+ * Use WARN_ONCE() to report the failure.
+ */
+ if (__halt(irq_disabled, do_sti))
+ WARN_ONCE(1, "HLT instruction emulation failed\n");
+}
+
+static int read_msr(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_READ),
+ .r12 = regs->cx,
+ };
+
+ /*
+ * Emulate the MSR read via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
+ */
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return -EIO;
+
+ regs->ax = lower_32_bits(args.r11);
+ regs->dx = upper_32_bits(args.r11);
+ return ve_instr_len(ve);
+}
+
+static int write_msr(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
+ .r12 = regs->cx,
+ .r13 = (u64)regs->dx << 32 | regs->ax,
+ };
+
+ /*
+ * Emulate the MSR write via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
+ */
+ if (__tdx_hypercall(&args, 0))
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_CPUID),
+ .r12 = regs->ax,
+ .r13 = regs->cx,
+ };
+
+ /*
+ * Only allow VMM to control range reserved for hypervisor
+ * communication.
+ *
+ * Return all-zeros for any CPUID outside the range. It matches CPU
+ * behaviour for non-supported leaf.
+ */
+ if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
+ regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ return ve_instr_len(ve);
+ }
+
+ /*
+ * Emulate the CPUID instruction via a hypercall. More info about
+ * ABI can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
+ */
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return -EIO;
+
+ /*
+ * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
+ * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
+ * So copy the register contents back to pt_regs.
+ */
+ regs->ax = args.r12;
+ regs->bx = args.r13;
+ regs->cx = args.r14;
+ regs->dx = args.r15;
+
+ return ve_instr_len(ve);
+}
+
+static bool mmio_read(int size, unsigned long addr, unsigned long *val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
+ .r12 = size,
+ .r13 = EPT_READ,
+ .r14 = addr,
+ .r15 = *val,
+ };
+
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return false;
+ *val = args.r11;
+ return true;
+}
+
+static bool mmio_write(int size, unsigned long addr, unsigned long val)
+{
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
+ EPT_WRITE, addr, val);
+}
+
+static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+{
+ unsigned long *reg, val, vaddr;
+ char buffer[MAX_INSN_SIZE];
+ struct insn insn = {};
+ enum mmio_type mmio;
+ int size, extend_size;
+ u8 extend_val = 0;
+
+ /* Only in-kernel MMIO is supported */
+ if (WARN_ON_ONCE(user_mode(regs)))
+ return -EFAULT;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
+ return -EFAULT;
+
+ if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
+ return -EINVAL;
+
+ mmio = insn_decode_mmio(&insn, &size);
+ if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
+ return -EINVAL;
+
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg = insn_get_modrm_reg_ptr(&insn, regs);
+ if (!reg)
+ return -EINVAL;
+ }
+
+ /*
+ * Reject EPT violation #VEs that split pages.
+ *
+ * MMIO accesses are supposed to be naturally aligned and therefore
+ * never cross page boundaries. Seeing split page accesses indicates
+ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
+ *
+ * load_unaligned_zeropad() will recover using exception fixups.
+ */
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
+ return -EFAULT;
+
+ /* Handle writes first */
+ switch (mmio) {
+ case MMIO_WRITE:
+ memcpy(&val, reg, size);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
+ case MMIO_WRITE_IMM:
+ val = insn.immediate.value;
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
+ case MMIO_READ:
+ case MMIO_READ_ZERO_EXTEND:
+ case MMIO_READ_SIGN_EXTEND:
+ /* Reads are handled below */
+ break;
+ case MMIO_MOVS:
+ case MMIO_DECODE_FAILED:
+ /*
+ * MMIO was accessed with an instruction that could not be
+ * decoded or handled properly. It was likely not using io.h
+ * helpers or accessed MMIO accidentally.
+ */
+ return -EINVAL;
+ default:
+ WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
+ return -EINVAL;
+ }
+
+ /* Handle reads */
+ if (!mmio_read(size, ve->gpa, &val))
+ return -EIO;
+
+ switch (mmio) {
+ case MMIO_READ:
+ /* Zero-extend for 32-bit operation */
+ extend_size = size == 4 ? sizeof(*reg) : 0;
+ break;
+ case MMIO_READ_ZERO_EXTEND:
+ /* Zero extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ /* Sign extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ if (size == 1 && val & BIT(7))
+ extend_val = 0xFF;
+ else if (size > 1 && val & BIT(15))
+ extend_val = 0xFF;
+ break;
+ default:
+ /* All other cases has to be covered with the first switch() */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ if (extend_size)
+ memset(reg, extend_val, extend_size);
+ memcpy(reg, &val, size);
+ return insn.length;
+}
+
+static bool handle_in(struct pt_regs *regs, int size, int port)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = PORT_READ,
+ .r14 = port,
+ };
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+ bool success;
+
+ /*
+ * Emulate the I/O read via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);
+
+ /* Update part of the register affected by the emulated instruction */
+ regs->ax &= ~mask;
+ if (success)
+ regs->ax |= args.r11 & mask;
+
+ return success;
+}
+
+static bool handle_out(struct pt_regs *regs, int size, int port)
+{
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+
+ /*
+ * Emulate the I/O write via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
+ PORT_WRITE, port, regs->ax & mask);
+}
+
+/*
+ * Emulate I/O using hypercall.
+ *
+ * Assumes the IO instruction was using ax, which is enforced
+ * by the standard io.h macros.
+ *
+ * Return True on success or False on failure.
+ */
+static int handle_io(struct pt_regs *regs, struct ve_info *ve)
+{
+ u32 exit_qual = ve->exit_qual;
+ int size, port;
+ bool in, ret;
+
+ if (VE_IS_IO_STRING(exit_qual))
+ return -EIO;
+
+ in = VE_IS_IO_IN(exit_qual);
+ size = VE_GET_IO_SIZE(exit_qual);
+ port = VE_GET_PORT_NUM(exit_qual);
+
+
+ if (in)
+ ret = handle_in(regs, size, port);
+ else
+ ret = handle_out(regs, size, port);
+ if (!ret)
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+/*
+ * Early #VE exception handler. Only handles a subset of port I/O.
+ * Intended only for earlyprintk. If failed, return false.
+ */
+__init bool tdx_early_handle_ve(struct pt_regs *regs)
+{
+ struct ve_info ve;
+ int insn_len;
+
+ tdx_get_ve_info(&ve);
+
+ if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
+ return false;
+
+ insn_len = handle_io(regs, &ve);
+ if (insn_len < 0)
+ return false;
+
+ regs->ip += insn_len;
+ return true;
+}
+
+void tdx_get_ve_info(struct ve_info *ve)
+{
+ struct tdx_module_output out;
+
+ /*
+ * Called during #VE handling to retrieve the #VE info from the
+ * TDX module.
+ *
+ * This has to be called early in #VE handling. A "nested" #VE which
+ * occurs before this will raise a #DF and is not recoverable.
+ *
+ * The call retrieves the #VE info from the TDX module, which also
+ * clears the "#VE valid" flag. This must be done before anything else
+ * because any #VE that occurs while the valid flag is set will lead to
+ * #DF.
+ *
+ * Note, the TDX module treats virtual NMIs as inhibited if the #VE
+ * valid flag is set. It means that NMI=>#VE will not result in a #DF.
+ */
+ tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+
+ /* Transfer the output parameters */
+ ve->exit_reason = out.rcx;
+ ve->exit_qual = out.rdx;
+ ve->gla = out.r8;
+ ve->gpa = out.r9;
+ ve->instr_len = lower_32_bits(out.r10);
+ ve->instr_info = upper_32_bits(out.r10);
+}
+
+/*
+ * Handle the user initiated #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs, ve);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return -EIO;
+ }
+}
+
+/*
+ * Handle the kernel #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ return handle_halt(ve);
+ case EXIT_REASON_MSR_READ:
+ return read_msr(regs, ve);
+ case EXIT_REASON_MSR_WRITE:
+ return write_msr(regs, ve);
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs, ve);
+ case EXIT_REASON_EPT_VIOLATION:
+ return handle_mmio(regs, ve);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return handle_io(regs, ve);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return -EIO;
+ }
+}
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
+{
+ int insn_len;
+
+ if (user_mode(regs))
+ insn_len = virt_exception_user(regs, ve);
+ else
+ insn_len = virt_exception_kernel(regs, ve);
+ if (insn_len < 0)
+ return false;
+
+ /* After successful #VE handling, move the IP */
+ regs->ip += insn_len;
+
+ return true;
+}
+
+static bool tdx_tlb_flush_required(bool private)
+{
+ /*
+ * TDX guest is responsible for flushing TLB on private->shared
+ * transition. VMM is responsible for flushing on shared->private.
+ *
+ * The VMM _can't_ flush private addresses as it can't generate PAs
+ * with the guest's HKID. Shared memory isn't subject to integrity
+ * checking, i.e. the VMM doesn't need to flush for its own protection.
+ *
+ * There's no need to flush when converting from shared to private,
+ * as flushing is the VMM's responsibility in this case, e.g. it must
+ * flush to avoid integrity failures in the face of a buggy or
+ * malicious guest.
+ */
+ return !private;
+}
+
+static bool tdx_cache_flush_required(void)
+{
+ /*
+ * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
+ * TDX doesn't have such capability.
+ *
+ * Flush cache unconditionally.
+ */
+ return true;
+}
+
+static bool try_accept_one(phys_addr_t *start, unsigned long len,
+ enum pg_level pg_level)
+{
+ unsigned long accept_size = page_level_size(pg_level);
+ u64 tdcall_rcx;
+ u8 page_size;
+
+ if (!IS_ALIGNED(*start, accept_size))
+ return false;
+
+ if (len < accept_size)
+ return false;
+
+ /*
+ * Pass the page physical address to the TDX module to accept the
+ * pending, private page.
+ *
+ * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
+ */
+ switch (pg_level) {
+ case PG_LEVEL_4K:
+ page_size = 0;
+ break;
+ case PG_LEVEL_2M:
+ page_size = 1;
+ break;
+ case PG_LEVEL_1G:
+ page_size = 2;
+ break;
+ default:
+ return false;
+ }
+
+ tdcall_rcx = *start | page_size;
+ if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
+ return false;
+
+ *start += accept_size;
+ return true;
+}
+
+/*
+ * Inform the VMM of the guest's intent for this physical page: shared with
+ * the VMM or private to the guest. The VMM is expected to change its mapping
+ * of the page in response.
+ */
+static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+{
+ phys_addr_t start = __pa(vaddr);
+ phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+
+ if (!enc) {
+ /* Set the shared (decrypted) bits: */
+ start |= cc_mkdec(0);
+ end |= cc_mkdec(0);
+ }
+
+ /*
+ * Notify the VMM about page mapping conversion. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface (GHCI),
+ * section "TDG.VP.VMCALL<MapGPA>"
+ */
+ if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
+ return false;
+
+ /* private->shared conversion requires only MapGPA call */
+ if (!enc)
+ return true;
+
+ /*
+ * For shared->private conversion, accept the page using
+ * TDX_ACCEPT_PAGE TDX module call.
+ */
+ while (start < end) {
+ unsigned long len = end - start;
+
+ /*
+ * Try larger accepts first. It gives chance to VMM to keep
+ * 1G/2M SEPT entries where possible and speeds up process by
+ * cutting number of hypercalls (if successful).
+ */
+
+ if (try_accept_one(&start, len, PG_LEVEL_1G))
+ continue;
+
+ if (try_accept_one(&start, len, PG_LEVEL_2M))
+ continue;
+
+ if (!try_accept_one(&start, len, PG_LEVEL_4K))
+ return false;
+ }
+
+ return true;
+}
+
+void __init tdx_early_init(void)
+{
+ u64 cc_mask;
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+
+ cc_set_vendor(CC_VENDOR_INTEL);
+ cc_mask = get_cc_mask();
+ cc_set_mask(cc_mask);
+
+ /*
+ * All bits above GPA width are reserved and kernel treats shared bit
+ * as flag, not as part of physical address.
+ *
+ * Adjust physical mask to only cover valid GPA bits.
+ */
+ physical_mask &= cc_mask - 1;
+
+ x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;
+
+ pr_info("Guest detected\n");
+}
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9c9c4a888b1d..98a4852ed6a0 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,6 +1,7 @@
-# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_WERROR=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_USELIB=y
CONFIG_AUDIT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
@@ -12,23 +13,30 @@ CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
+CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_MISC=y
+CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
-# CONFIG_64BIT is not set
CONFIG_SMP=y
-CONFIG_X86_GENERIC=y
-CONFIG_HPET_TIMER=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_NR_CPUS=8
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_X86_REBOOTFIXUPS=y
CONFIG_MICROCODE_AMD=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
-CONFIG_HIGHPTE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
# CONFIG_MTRR_SANITIZER is not set
CONFIG_EFI=y
@@ -44,12 +52,15 @@ CONFIG_ACPI_BGRT=y
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_X86_ACPI_CPUFREQ=y
-CONFIG_EFI_VARS=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
+CONFIG_COMPAT_32BIT_TIME=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_BLK_CGROUP_IOLATENCY=y
+CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
CONFIG_BINFMT_MISC=y
CONFIG_NET=y
CONFIG_PACKET=y
@@ -104,12 +115,16 @@ CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
CONFIG_NET_SCHED=y
+CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_EMATCH=y
CONFIG_NET_CLS_ACT=y
+CONFIG_CGROUP_NET_PRIO=y
CONFIG_CFG80211=y
CONFIG_MAC80211=y
CONFIG_MAC80211_LEDS=y
CONFIG_RFKILL=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCI_MSI=y
@@ -120,13 +135,16 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
+CONFIG_EFI_VARS=y
+CONFIG_EFI_CAPSULE_LOADER=y
CONFIG_BLK_DEV_LOOP=y
+CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
CONFIG_BLK_DEV_SR=y
CONFIG_CHR_DEV_SG=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
CONFIG_SATA_AHCI=y
CONFIG_ATA_PIIX=y
@@ -144,6 +162,7 @@ CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
CONFIG_NETCONSOLE=y
+CONFIG_VIRTIO_NET=y
CONFIG_BNX2=y
CONFIG_TIGON3=y
CONFIG_NET_TULIP=y
@@ -156,7 +175,6 @@ CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
@@ -172,6 +190,7 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM=y
CONFIG_NVRAM=y
CONFIG_HPET=y
@@ -183,12 +202,7 @@ CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
CONFIG_DRM=y
CONFIG_DRM_I915=y
-CONFIG_FB_MODE_HELPERS=y
-CONFIG_FB_TILEBLITTING=y
-CONFIG_FB_EFI=y
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_DRM_VIRTIO_GPU=y
CONFIG_SOUND=y
CONFIG_SND=y
CONFIG_SND_HRTIMER=y
@@ -221,6 +235,8 @@ CONFIG_USB_STORAGE=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_INPUT=y
CONFIG_EEEPC_LAPTOP=y
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
@@ -242,6 +258,7 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
+CONFIG_9P_FS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
@@ -253,13 +270,15 @@ CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
+CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_STACK_USAGE=y
-CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_UNWINDER_FRAME_POINTER=y
+# CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index b60bd2d86034..69784505a7a8 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,4 +1,4 @@
-# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_WERROR=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
@@ -12,14 +12,25 @@ CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
+CONFIG_BLK_CGROUP=y
CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_MISC=y
+CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_SMP=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_MICROCODE_AMD=y
CONFIG_X86_MSR=y
@@ -42,12 +53,14 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_X86_ACPI_CPUFREQ=y
CONFIG_IA32_EMULATION=y
-CONFIG_EFI_VARS=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_BLK_CGROUP_IOLATENCY=y
+CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
CONFIG_BINFMT_MISC=y
CONFIG_NET=y
CONFIG_PACKET=y
@@ -102,12 +115,16 @@ CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
CONFIG_NET_SCHED=y
+CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_EMATCH=y
CONFIG_NET_CLS_ACT=y
+CONFIG_CGROUP_NET_PRIO=y
CONFIG_CFG80211=y
CONFIG_MAC80211=y
CONFIG_MAC80211_LEDS=y
CONFIG_RFKILL=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI=y
@@ -117,13 +134,15 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
+CONFIG_EFI_VARS=y
CONFIG_BLK_DEV_LOOP=y
+CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
CONFIG_BLK_DEV_SR=y
CONFIG_CHR_DEV_SG=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
CONFIG_SATA_AHCI=y
CONFIG_ATA_PIIX=y
@@ -139,6 +158,7 @@ CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
CONFIG_NETCONSOLE=y
+CONFIG_VIRTIO_NET=y
CONFIG_TIGON3=y
CONFIG_NET_TULIP=y
CONFIG_E100=y
@@ -148,7 +168,6 @@ CONFIG_SKY2=y
CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
@@ -164,6 +183,7 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM=y
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
@@ -177,12 +197,7 @@ CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
CONFIG_DRM=y
CONFIG_DRM_I915=y
-CONFIG_FB_MODE_HELPERS=y
-CONFIG_FB_TILEBLITTING=y
-CONFIG_FB_EFI=y
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_DRM_VIRTIO_GPU=y
CONFIG_SOUND=y
CONFIG_SND=y
CONFIG_SND_HRTIMER=y
@@ -215,6 +230,8 @@ CONFIG_USB_STORAGE=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_INPUT=y
CONFIG_EEEPC_LAPTOP=y
CONFIG_AMD_IOMMU=y
CONFIG_INTEL_IOMMU=y
@@ -239,6 +256,7 @@ CONFIG_NFS_FS=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
+CONFIG_9P_FS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ASCII=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b28e36b7c96b..2831685adf6f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
#
# x86 crypto algorithms
-OBJECT_FILES_NON_STANDARD := y
-
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
@@ -64,7 +62,9 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+blake2s-x86_64-y := blake2s-shash.o
+obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
@@ -90,6 +90,15 @@ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
+sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o
+sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o
+sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 51d46d93efbc..b48ddebb4748 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial)
pxor T0, MSG
.Lld_partial_8:
- ret
+ RET
SYM_FUNC_END(__load_partial)
/*
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial)
mov %r10b, (%r9)
.Lst_partial_1:
- ret
+ RET
SYM_FUNC_END(__store_partial)
/*
@@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_init)
/*
@@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_1:
movdqu STATE4, 0x00(STATEP)
@@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_2:
movdqu STATE3, 0x00(STATEP)
@@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_3:
movdqu STATE2, 0x00(STATEP)
@@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_4:
movdqu STATE1, 0x00(STATEP)
@@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)
.macro encrypt_block a s0 s1 s2 s3 s4 i
@@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc)
/*
@@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
.macro decrypt_block a s0 s1 s2 s3 s4 i
@@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec)
/*
@@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
/*
@@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu MSG, (%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 3f0fc7dd87d7..43852ba6e19c 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -1,65 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
- * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
- *
- * This is AES128/192/256 CTR mode optimization implementation. It requires
- * the support of Intel(R) AESNI and AVX instructions.
- *
- * This work was inspired by the AES CTR mode optimization published
- * in Intel Optimized IPSEC Cryptograhpic library.
- * Additional information on it can be found at:
- * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
+ * AES CTR mode by8 optimization with AVX instructions. (x86_64)
*
* Copyright(c) 2014 Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Sean Gulley <sean.m.gulley@intel.com>
* Chandramouli Narayanan <mouli@linux.intel.com>
+ */
+/*
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
*
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptographic library.
+ * Additional information on it can be found at:
+ * https://github.com/intel/intel-ipsec-mb
*/
#include <linux/linkage.h>
@@ -525,7 +482,7 @@ ddq_add_8:
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
- ret
+ RET
.endm
/*
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 4e3972570916..837c1e0aa021 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1594,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec)
GCM_ENC_DEC dec
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec)
@@ -1683,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc)
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
@@ -1701,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init)
FUNC_SAVE
GCM_INIT %arg3, %arg4,%arg5, %arg6
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init)
/*****************************************************************************
@@ -1716,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update)
FUNC_SAVE
GCM_ENC_DEC enc
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update)
/*****************************************************************************
@@ -1731,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update)
FUNC_SAVE
GCM_ENC_DEC dec
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update)
/*****************************************************************************
@@ -1746,13 +1746,11 @@ SYM_FUNC_START(aesni_gcm_finalize)
FUNC_SAVE
GCM_COMPLETE %arg3 %arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize)
#endif
-
-SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -1762,9 +1760,9 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a)
pxor %xmm1, %xmm0
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256a)
-SYM_FUNC_END_ALIAS(_key_expansion_128)
+SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
SYM_FUNC_START_LOCAL(_key_expansion_192a)
pshufd $0b01010101, %xmm1, %xmm1
@@ -1787,7 +1785,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192a)
SYM_FUNC_START_LOCAL(_key_expansion_192b)
@@ -1806,7 +1804,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b)
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192b)
SYM_FUNC_START_LOCAL(_key_expansion_256b)
@@ -1818,7 +1816,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
pxor %xmm1, %xmm2
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256b)
/*
@@ -1933,7 +1931,7 @@ SYM_FUNC_START(aesni_set_key)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_set_key)
/*
@@ -1957,7 +1955,7 @@ SYM_FUNC_START(aesni_enc)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_enc)
/*
@@ -2014,7 +2012,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc1)
aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
aesenclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_enc1)
/*
@@ -2122,7 +2120,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
aesenclast KEY, STATE2
aesenclast KEY, STATE3
aesenclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_enc4)
/*
@@ -2147,7 +2145,7 @@ SYM_FUNC_START(aesni_dec)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_dec)
/*
@@ -2204,7 +2202,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec1)
aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
aesdeclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_dec1)
/*
@@ -2312,7 +2310,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec4)
aesdeclast KEY, STATE2
aesdeclast KEY, STATE3
aesdeclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_dec4)
/*
@@ -2372,7 +2370,7 @@ SYM_FUNC_START(aesni_ecb_enc)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_enc)
/*
@@ -2433,7 +2431,7 @@ SYM_FUNC_START(aesni_ecb_dec)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_dec)
/*
@@ -2477,7 +2475,7 @@ SYM_FUNC_START(aesni_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_enc)
/*
@@ -2570,7 +2568,7 @@ SYM_FUNC_START(aesni_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_dec)
/*
@@ -2627,7 +2625,7 @@ SYM_FUNC_START(aesni_cts_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_enc)
/*
@@ -2688,7 +2686,7 @@ SYM_FUNC_START(aesni_cts_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_dec)
.pushsection .rodata
@@ -2725,7 +2723,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc_init)
mov $1, TCTR_LOW
movq TCTR_LOW, INC
movq CTR, TCTR_LOW
- ret
+ RET
SYM_FUNC_END(_aesni_inc_init)
/*
@@ -2753,7 +2751,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc)
.Linc_low:
movaps CTR, IV
pshufb BSWAP_MASK, IV
- ret
+ RET
SYM_FUNC_END(_aesni_inc)
/*
@@ -2816,7 +2814,7 @@ SYM_FUNC_START(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ctr_enc)
#endif
@@ -2932,7 +2930,7 @@ SYM_FUNC_START(aesni_xts_encrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_enc_1x:
add $64, LEN
@@ -3092,7 +3090,7 @@ SYM_FUNC_START(aesni_xts_decrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_dec_1x:
add $64, LEN
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 2cf8e94d986a..0852ab573fd3 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -212,10 +212,6 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
-#define arg7 STACK_OFFSET+8*1(%r14)
-#define arg8 STACK_OFFSET+8*2(%r14)
-#define arg9 STACK_OFFSET+8*3(%r14)
-#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)
i = 0
@@ -237,9 +233,6 @@ define_reg j %j
.noaltmacro
.endm
-# need to push 4 registers into stack to maintain
-STACK_OFFSET = 8*4
-
TMP1 = 16*0 # Temporary storage for AAD
TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 = 16*2 # Temporary storage for AES State 3
@@ -256,25 +249,22 @@ VARIABLE_OFFSET = 16*8
################################
.macro FUNC_SAVE
- #the number of pushes must equal STACK_OFFSET
push %r12
push %r13
- push %r14
push %r15
- mov %rsp, %r14
-
-
+ push %rbp
+ mov %rsp, %rbp
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
.endm
.macro FUNC_RESTORE
- mov %r14, %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
- pop %r14
pop %r13
pop %r12
.endm
@@ -294,7 +284,7 @@ VARIABLE_OFFSET = 16*8
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
+# clobbering r10, r11, r12, r13, r15, rax
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
vmovdqu AadHash(arg2), %xmm8
vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
@@ -996,7 +986,7 @@ _partial_block_done_\@:
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
@@ -1231,7 +1221,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
@@ -1777,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2)
FUNC_SAVE
INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
###############################################################################
@@ -1798,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
###############################################################################
@@ -1827,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
###############################################################################
@@ -1856,15 +1846,15 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
###############################################################################
@@ -1944,7 +1934,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
@@ -2186,7 +2176,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
@@ -2745,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4)
FUNC_SAVE
INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
@@ -2766,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
@@ -2795,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
@@ -2824,13 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2144e54a6c89..41901ba9d3a2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -849,6 +849,8 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
return -EINVAL;
err = skcipher_walk_virt(&walk, req, false);
+ if (!walk.nbytes)
+ return err;
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
@@ -862,7 +864,10 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
skcipher_request_set_crypt(&subreq, req->src, req->dst,
blocks * AES_BLOCK_SIZE, req->iv);
req = &subreq;
+
err = skcipher_walk_virt(&walk, req, false);
+ if (!walk.nbytes)
+ return err;
} else {
tail = 0;
}
@@ -1102,7 +1107,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
}, {
@@ -1119,7 +1124,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
} };
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
index 2ca79974f819..b50b35ff1fdb 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/crypto/blake2s-core.S
@@ -171,7 +171,7 @@ SYM_FUNC_START(blake2s_compress_ssse3)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
- ret
+ RET
SYM_FUNC_END(blake2s_compress_ssse3)
#ifdef CONFIG_AS_AVX512
@@ -251,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
- retq
+ RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..69853c13e8fb 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -5,7 +5,6 @@
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -28,9 +27,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress_arch(struct blake2s_state *state,
- const u8 *block, size_t nblocks,
- const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,49 +54,12 @@ void blake2s_compress_arch(struct blake2s_state *state,
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
- const u8 *in, unsigned int inlen)
-{
- return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
- return crypto_blake2s_final(desc, out, blake2s_compress_arch);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size) \
- { \
- .base.cra_name = name, \
- .base.cra_driver_name = driver_name, \
- .base.cra_priority = 200, \
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
- .base.cra_module = THIS_MODULE, \
- .digestsize = digest_size, \
- .setkey = crypto_blake2s_setkey, \
- .init = crypto_blake2s_init, \
- .update = crypto_blake2s_update_x86, \
- .final = crypto_blake2s_final_x86, \
- .descsize = sizeof(struct blake2s_state), \
- }
-
-static struct shash_alg blake2s_algs[] = {
- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
+EXPORT_SYMBOL(blake2s_compress);
static int __init blake2s_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&blake2s_use_ssse3);
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
@@ -109,26 +70,9 @@ static int __init blake2s_mod_init(void)
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
- crypto_register_shashes(blake2s_algs,
- ARRAY_SIZE(blake2s_algs)) : 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
}
module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
new file mode 100644
index 000000000000..59ae28abe35c
--- /dev/null
+++ b/arch/x86/crypto/blake2s-shash.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static int crypto_blake2s_update_x86(struct shash_desc *desc,
+ const u8 *in, unsigned int inlen)
+{
+ return crypto_blake2s_update(desc, in, inlen, false);
+}
+
+static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
+{
+ return crypto_blake2s_final(desc, out, false);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size) \
+ { \
+ .base.cra_name = name, \
+ .base.cra_driver_name = driver_name, \
+ .base.cra_priority = 200, \
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+ .base.cra_module = THIS_MODULE, \
+ .digestsize = digest_size, \
+ .setkey = crypto_blake2s_setkey, \
+ .init = crypto_blake2s_init, \
+ .update = crypto_blake2s_update_x86, \
+ .final = crypto_blake2s_final_x86, \
+ .descsize = sizeof(struct blake2s_state), \
+ }
+
+static struct shash_alg blake2s_algs[] = {
+ BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_mod_init(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4222ac6d6584..802d71582689 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -135,10 +135,10 @@ SYM_FUNC_START(__blowfish_enc_blk)
jnz .L__enc_xor;
write_block();
- ret;
+ RET;
.L__enc_xor:
xor_block();
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk)
SYM_FUNC_START(blowfish_dec_blk)
@@ -170,7 +170,7 @@ SYM_FUNC_START(blowfish_dec_blk)
movq %r11, %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk)
/**********************************************************************
@@ -322,14 +322,14 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
.L__enc_xor4:
xor_block4();
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
SYM_FUNC_START(blowfish_dec_blk_4way)
@@ -364,5 +364,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index a880e0b1c255..ba06322c1e39 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -32,24 +32,12 @@ static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
__blowfish_enc_blk(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __blowfish_enc_blk(ctx, dst, src, true);
-}
-
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __blowfish_enc_blk_4way(ctx, dst, src, true);
-}
-
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
@@ -315,7 +303,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init blowfish_init(void)
{
int err;
@@ -339,15 +327,15 @@ static int __init init(void)
return err;
}
-static void __exit fini(void)
+static void __exit blowfish_fini(void)
{
crypto_unregister_alg(&bf_cipher_alg);
crypto_unregister_skciphers(bf_skcipher_algs,
ARRAY_SIZE(bf_skcipher_algs));
}
-module_init(init);
-module_exit(fini);
+module_init(blowfish_init);
+module_exit(blowfish_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index e2a0e0f4bf9d..2e1658ddbe1a 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -192,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -200,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
%xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -778,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -865,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -906,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
SYM_FUNC_START(camellia_ecb_dec_16way)
@@ -936,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
SYM_FUNC_START(camellia_cbc_dec_16way)
@@ -987,5 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 782e9712a1ec..0e4e9abbf4de 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -226,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -234,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -814,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -901,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -946,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
@@ -980,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
@@ -990,6 +990,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* %rdx: src (32 blocks)
*/
FRAME_BEGIN
+ subq $(16 * 32), %rsp;
vzeroupper;
@@ -1002,7 +1003,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
- movq %rsp, %r10;
cmpq %rsi, %rdx;
je .Lcbc_dec_use_stack;
@@ -1015,7 +1015,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* dst still in-use (because dst == src), so use stack for temporary
* storage.
*/
- subq $(16 * 32), %rsp;
movq %rsp, %rax;
.Lcbc_dec_continue:
@@ -1025,7 +1024,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vpxor %ymm7, %ymm7, %ymm7;
vinserti128 $1, (%rdx), %ymm7, %ymm7;
vpxor (%rax), %ymm7, %ymm7;
- movq %r10, %rsp;
vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
@@ -1047,6 +1045,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vzeroupper;
+ addq $(16 * 32), %rsp;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 1372e6408850..347c059f5940 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk)
enc_outunpack(mov, RT1);
movq RR12, %r12;
- ret;
+ RET;
.L__enc_xor:
enc_outunpack(xor, RT1);
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk)
SYM_FUNC_START(camellia_dec_blk)
@@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk)
dec_outunpack();
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk)
/**********************************************************************
@@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
SYM_FUNC_START(camellia_dec_blk_2way)
@@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way)
movq RR12, %r12;
movq RXOR, %rbx;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 66c435ba9d3d..d45e9c0c42ac 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1377,7 +1377,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init camellia_init(void)
{
int err;
@@ -1401,15 +1401,15 @@ static int __init init(void)
return err;
}
-static void __exit fini(void)
+static void __exit camellia_fini(void)
{
crypto_unregister_alg(&camellia_cipher_alg);
crypto_unregister_skciphers(camellia_skcipher_algs,
ARRAY_SIZE(camellia_skcipher_algs));
}
-module_init(init);
-module_exit(fini);
+module_init(camellia_init);
+module_exit(camellia_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 8a6181b08b59..b258af420c92 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast5_enc_blk16)
.align 16
@@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
.L__skip_dec:
vpsrldq $4, RKR, RKR;
@@ -393,7 +393,7 @@ SYM_FUNC_START(cast5_ecb_enc_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_enc_16way)
SYM_FUNC_START(cast5_ecb_dec_16way)
@@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_dec_16way)
SYM_FUNC_START(cast5_cbc_dec_16way)
@@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
SYM_FUNC_START(cast5_ctr_16way)
@@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index fbddcecc3e3f..82b716fd5dba 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -289,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_enc_blk8)
.align 8
@@ -336,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_dec_blk8)
SYM_FUNC_START(cast6_ecb_enc_8way)
@@ -359,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_enc_8way)
SYM_FUNC_START(cast6_ecb_dec_8way)
@@ -382,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_dec_8way)
SYM_FUNC_START(cast6_cbc_dec_8way)
@@ -408,5 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_cbc_dec_8way)
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
index ee9a40ab4109..f3d8fc018249 100644
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2)
.Ldone8:
vzeroupper
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
index bb193fde123a..259383e1ad44 100644
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -166,13 +166,13 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
- jz .Ldone8
+ jz .Ldone2
mov %rax,%r9
and $~0xf,%r9
@@ -432,13 +432,13 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
- jz .Ldone8
+ jz .Ldone4
mov %rax,%r9
and $~0xf,%r9
@@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl)
.Ldone8:
vzeroupper
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index ca1788bfee16..7111949cd5b9 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute)
sub $2,%r8d
jnz .Ldoubleround
- ret
+ RET
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_ssse3)
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
.Ldone:
FRAME_END
- ret
+ RET
.Lxorpart:
# xor remaining bytes from partial register into output
@@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3)
movdqu %xmm3,0x10(%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(hchacha_block_ssse3)
SYM_FUNC_START(chacha_4block_xor_ssse3)
@@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3)
.Ldone4:
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index 6e7d4c4d3208..c392a6edbfff 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -236,5 +236,5 @@ fold_64:
pxor %xmm2, %xmm1
pextrd $0x01, %xmm1, %eax
- ret
+ RET
SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 7c4c7b2fbf05..98cf3b4e4c9f 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -24,7 +24,7 @@
/*
* Copyright 2012 Xyratex Technology Limited
*
- * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
*/
#include <linux/init.h>
#include <linux/module.h>
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 884dc767b051..ec35915f0901 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -53,7 +53,7 @@
.endm
.macro JMPTBL_ENTRY i
-.word crc_\i - crc_array
+.quad crc_\i
.endm
.macro JNC_LESS_THAN j
@@ -168,10 +168,7 @@ continue_block:
xor crc2, crc2
## branch into array
- lea jump_table(%rip), %bufp
- movzwq (%bufp, %rax, 2), len
- lea crc_array(%rip), %bufp
- lea (%bufp, len, 1), %bufp
+ mov jump_table(,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
@@ -198,6 +195,7 @@ crc_array:
.altmacro
LABEL crc_ %i
.noaltmacro
+ ENDBR
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
crc32q -i*8(block_2), crc2
@@ -207,6 +205,7 @@ LABEL crc_ %i
.altmacro
LABEL crc_ %i
.noaltmacro
+ ENDBR
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
@@ -240,6 +239,7 @@ LABEL crc_ %i
################################################################
LABEL crc_ 0
+ ENDBR
mov tmp, len
cmp $128*24, tmp
jae full_block
@@ -309,7 +309,7 @@ do_return:
popq %rsi
popq %rdi
popq %rbx
- ret
+ RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index b2533d63030e..721474abfb71 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl)
# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
pextrw $0, %xmm0, %eax
- ret
+ RET
.align 16
.Lless_than_256_bytes:
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 5af8021b98ce..d55fa9e9b9e6 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
/* Return the carry bit in a register */
" adcx %%r11, %1;"
- : "+&r" (f2), "=&r" (carry_r)
- : "r" (out), "r" (f1)
- : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
return carry_r;
}
@@ -108,17 +107,16 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" cmovc %0, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
-/* Computes the field substraction of two field elements */
+/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- /* Compute the raw substraction of f1-f2 */
+ /* Compute the raw subtraction of f1-f2 */
" movq 0(%1), %%r8;"
" subq 0(%2), %%r8;"
" movq 8(%1), %%r9;"
@@ -135,7 +133,7 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" mov $38, %%rcx;"
" cmovc %%rcx, %%rax;"
- /* Step 2: Substract carry*38 from the original difference */
+ /* Step 2: Subtract carry*38 from the original difference */
" sub %%rax, %%r8;"
" sbb $0, %%r9;"
" sbb $0, %%r10;"
@@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" movq %%r9, 8(%0);"
" movq %%r10, 16(%0);"
" movq %%r11, 24(%0);"
- :
- : "r" (out), "r" (f1), "r" (f2)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ :
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
/* Computes a field multiplication: out <- f1 * f2
@@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication: tmp <- src1 * src2 */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
/* Computes two field multiplications:
- * out[0] <- f1[0] * f2[0]
- * out[1] <- f1[1] * f2[1]
- * Uses the 16-element buffer tmp for intermediate results. */
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
/* Compute src1[0] * src2 */
- " movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the results back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%2);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 40(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 48(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 56(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
-/* Computes the field multiplication of four-element f1 with value in f2 */
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
register u64 f2_r asm("rdx") = f2;
asm volatile(
/* Compute the raw multiplication of f1*f2 */
- " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
- " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
- " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
" adcx %%rbx, %%r10;"
- " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
@@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2_r)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
- );
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
}
/* Computes p1 <- bit ? p2 : p1 in constant time */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- /* Invert the polarity of bit to match cmov expectations */
+ /* Transfer bit into CF flag */
" add $18446744073709551615, %0;"
/* cswap p1[0], p2[0] */
@@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" cmovc %%r10, %%r9;"
" movq %%r8, 56(%1);"
" movq %%r9, 56(%2);"
- : "+&r" (bit)
- : "r" (p1), "r" (p2)
- : "%r8", "%r9", "%r10", "memory", "cc"
- );
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
}
/* Computes the square of a field element: out <- f * f
@@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Compute the raw multiplication: tmp <- f * f */
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
/* Computes two field squarings:
- * out[0] <- f[0] * f[0]
- * out[1] <- f[1] * f[1]
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
* Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Step 1: Compute all partial products */
- " movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 56(%1), %%rdx;" /* f[3] */
- " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 64(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 72(%0);"
- " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
- " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
- " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%1);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
+ " mulxq 96(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
+ " adoxq 88(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 40(%0);"
+ " movq %%r9, 40(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 48(%0);"
+ " movq %%r10, 48(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 56(%0);"
+ " movq %%r11, 56(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
@@ -1500,7 +1711,7 @@ static int __init curve25519_mod_init(void)
static void __exit curve25519_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
- (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
+ static_branch_likely(&curve25519_use_bmi2_adx))
crypto_unregister_kpp(&curve25519_alg);
}
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index fac0fdc3f25d..f4c760f4cade 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
/***********************************************************************
@@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index e7cb68a3db3b..abb8b1fe123b 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -45,14 +45,6 @@ static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
}
-static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- u32 *enc_ctx = ctx->enc.expkey;
-
- des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
-}
-
static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
@@ -164,7 +156,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -243,7 +235,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 99ac25e18e09..2bf871899920 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -85,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
psrlq $1, T2
pxor T2, T1
pxor T1, DATA
- ret
+ RET
SYM_FUNC_END(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
@@ -99,7 +99,7 @@ SYM_FUNC_START(clmul_ghash_mul)
pshufb BSWAP, DATA
movups DATA, (%rdi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_mul)
/*
@@ -128,5 +128,5 @@ SYM_FUNC_START(clmul_ghash_update)
movups DATA, (%rdi)
.Lupdate_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index b22c7b936272..6a0b15e7196a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
- ret
+ RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index d7ae22dd6683..34c567bbcb4f 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2)
paddq PASS2_SUMS, T1
movdqu T0, 0x00(HASH)
movdqu T1, 0x10(HASH)
- ret
+ RET
SYM_FUNC_END(nh_sse2)
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 71fae5a09e56..2077ce7a5647 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -297,7 +297,7 @@ ___
$code.=<<___;
mov \$1,%eax
.Lno_key:
- ret
+ RET
___
&end_function("poly1305_init_x86_64");
@@ -373,7 +373,7 @@ $code.=<<___;
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
- ret
+ RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");
@@ -399,7 +399,7 @@ $code.=<<___;
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
___
&end_function("poly1305_emit_x86_64");
if ($avx) {
@@ -429,7 +429,7 @@ ___
&poly1305_iteration();
$code.=<<___;
pop $ctx
- ret
+ RET
.size __poly1305_block,.-__poly1305_block
.type __poly1305_init_avx,\@abi-omnipotent
@@ -594,7 +594,7 @@ __poly1305_init_avx:
lea -48-64($ctx),$ctx # size [de-]optimization
pop %rbp
- ret
+ RET
.size __poly1305_init_avx,.-__poly1305_init_avx
___
@@ -747,7 +747,7 @@ $code.=<<___;
.cfi_restore %rbp
.Lno_data_avx:
.Lblocks_avx_epilogue:
- ret
+ RET
.cfi_endproc
.align 32
@@ -1452,7 +1452,7 @@ $code.=<<___ if (!$win64);
___
$code.=<<___;
vzeroupper
- ret
+ RET
.cfi_endproc
___
&end_function("poly1305_blocks_avx");
@@ -1508,7 +1508,7 @@ $code.=<<___;
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
___
&end_function("poly1305_emit_avx");
@@ -1675,7 +1675,7 @@ $code.=<<___;
.cfi_restore %rbp
.Lno_data_avx2$suffix:
.Lblocks_avx2_epilogue$suffix:
- ret
+ RET
.cfi_endproc
.align 32
@@ -2201,7 +2201,7 @@ $code.=<<___ if (!$win64);
___
$code.=<<___;
vzeroupper
- ret
+ RET
.cfi_endproc
___
if($avx > 2 && $avx512) {
@@ -2792,7 +2792,7 @@ $code.=<<___ if (!$win64);
.cfi_def_cfa_register %rsp
___
$code.=<<___;
- ret
+ RET
.cfi_endproc
___
@@ -2893,7 +2893,7 @@ $code.=<<___ if ($flavour =~ /elf32/);
___
$code.=<<___;
mov \$1,%eax
- ret
+ RET
.size poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
@@ -3010,7 +3010,7 @@ poly1305_blocks_vpmadd52:
jnz .Lblocks_vpmadd52_4x
.Lno_data_vpmadd52:
- ret
+ RET
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
@@ -3451,7 +3451,7 @@ poly1305_blocks_vpmadd52_4x:
vzeroall
.Lno_data_vpmadd52_4x:
- ret
+ RET
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
@@ -3824,7 +3824,7 @@ $code.=<<___;
vzeroall
.Lno_data_vpmadd52_8x:
- ret
+ RET
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
@@ -3861,7 +3861,7 @@ poly1305_emit_base2_44:
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
} } }
@@ -3916,7 +3916,7 @@ xor128_encrypt_n_pad:
.Ldone_enc:
mov $otp,%rax
- ret
+ RET
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
@@ -3967,7 +3967,7 @@ xor128_decrypt_n_pad:
.Ldone_dec:
mov $otp,%rax
- ret
+ RET
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
@@ -4109,7 +4109,7 @@ avx_handler:
pop %rbx
pop %rdi
pop %rsi
- ret
+ RET
.size avx_handler,.-avx_handler
.section .pdata
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 646da46e8d10..1dfb8af48a3c 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -16,7 +16,7 @@
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_KEY_SIZE]);
+ const u8 key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
@@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx)
state->is_base2_26 = 0;
}
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
{
poly1305_init_x86_64(ctx, key);
}
@@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
poly1305_emit_avx(ctx, mac, nonce);
}
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_simd_init(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(&key[16]);
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index b7ee24df7fba..82f2313f512b 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
@@ -673,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)
SYM_FUNC_START(serpent_ecb_dec_8way_avx)
@@ -691,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
@@ -709,5 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx)
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 9161b6e441f3..8ea34c9b9316 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk16)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk16)
SYM_FUNC_START(serpent_ecb_enc_16way)
@@ -677,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_16way)
SYM_FUNC_START(serpent_ecb_dec_16way)
@@ -699,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_16way)
SYM_FUNC_START(serpent_cbc_dec_16way)
@@ -722,5 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_16way)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 6379b99cb722..8ccb03ad7cef 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way)
write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
.L__enc_xor4:
xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_4way)
SYM_FUNC_START(serpent_dec_blk_4way)
@@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way)
movl arg_dst(%esp), %eax;
write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index efb6dc17dc90..e0998a011d1d 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way)
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
.L__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_8way)
SYM_FUNC_START(serpent_dec_blk_8way)
@@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way)
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index ccf0b5fa4933..347e97f4b713 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -96,7 +96,7 @@ static struct skcipher_alg serpent_algs[] = {
static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-static int __init init(void)
+static int __init serpent_avx2_init(void)
{
const char *feature_name;
@@ -115,14 +115,14 @@ static int __init init(void)
serpent_simd_algs);
}
-static void __exit fini(void)
+static void __exit serpent_avx2_fini(void)
{
simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
serpent_simd_algs);
}
-module_init(init);
-module_exit(fini);
+module_init(serpent_avx2_init);
+module_exit(serpent_avx2_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1e594d60afa5..a96b2fd26dab 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -645,9 +645,9 @@ _loop3:
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
- mov %rsp, %rbx
+ push %rbp
+ mov %rsp, %rbp
and $~(0x20-1), %rsp
- push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
@@ -665,8 +665,8 @@ _loop3:
avx2_zeroupper
- add $RESERVE_STACK, %rsp
- pop %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
pop %r14
@@ -674,7 +674,7 @@ _loop3:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 11efe3a45a1f..2f94ec0e763b 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -59,8 +59,6 @@
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */
-#define RSPSAVE %rax
-
/* gcc conversion */
#define FRAME_SIZE 32 /* space for 2x16 bytes */
@@ -96,7 +94,8 @@
.text
.align 32
SYM_FUNC_START(sha1_ni_transform)
- mov %rsp, RSPSAVE
+ push %rbp
+ mov %rsp, %rbp
sub $FRAME_SIZE, %rsp
and $~0xF, %rsp
@@ -288,9 +287,10 @@ SYM_FUNC_START(sha1_ni_transform)
pextrd $3, E0, 1*16(DIGEST_PTR)
.Ldone_hash:
- mov RSPSAVE, %rsp
+ mov %rbp, %rsp
+ pop %rbp
- ret
+ RET
SYM_FUNC_END(sha1_ni_transform)
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index d25668d2a1e9..263f916362e0 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -99,7 +99,7 @@
pop %rbp
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 4739cd31b9db..3baa1ec39097 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -458,7 +458,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_avx)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 11ff60c29c8b..9bcdbc47b8b4 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -117,15 +117,13 @@ _XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
-_RSP_SIZE = 8
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
-_RSP = _CTX + _CTX_SIZE
-STACK_SIZE = _RSP + _RSP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
# rotate_Xs
# Rotate values of symbols X0...X3
@@ -533,11 +531,11 @@ SYM_FUNC_START(sha256_transform_rorx)
pushq %r14
pushq %r15
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
+
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
- mov %rax, _RSP(%rsp)
-
shl $6, NUM_BLKS # convert to bytes
jz done_hash
@@ -704,14 +702,15 @@ only_one_block:
done_hash:
- mov _RSP(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_rorx)
.section .rodata.cst512.K256, "aM", @progbits, 512
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index ddfa863b4ee3..c4a5db612c32 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -472,7 +472,7 @@ done_hash:
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_ssse3)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 7abade04a3a3..94d50dd27cb5 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform)
.Ldone_hash:
- ret
+ RET
SYM_FUNC_END(sha256_ni_transform)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 684d58c8bc4f..1fefe6dd3a9e 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -76,14 +76,10 @@ tmp0 = %rax
W_SIZE = 80*8
# W[t] + K[t] | W[t+1] + K[t+1]
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -281,18 +277,18 @@ SYM_FUNC_START(sha512_transform_avx)
test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -353,18 +349,19 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_avx)
########################################################################
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 3a44bdcfd583..5cdaab7d6901 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -102,17 +102,13 @@ SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
-frame_RSPSAVE = frame_CTX + CTX_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_CTX + CTX_SIZE
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -570,18 +566,18 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, 8*0+frame_GPRSAVE(%rsp)
- mov %r12, 8*1+frame_GPRSAVE(%rsp)
- mov %r13, 8*2+frame_GPRSAVE(%rsp)
- mov %r14, 8*3+frame_GPRSAVE(%rsp)
- mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz done_hash
@@ -672,16 +668,18 @@ loop2:
done_hash:
-# Restore GPRs
- mov 8*0+frame_GPRSAVE(%rsp), %rbx
- mov 8*1+frame_GPRSAVE(%rsp), %r12
- mov 8*2+frame_GPRSAVE(%rsp), %r13
- mov 8*3+frame_GPRSAVE(%rsp), %r14
- mov 8*4+frame_GPRSAVE(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
- ret
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
SYM_FUNC_END(sha512_transform_rorx)
########################################################################
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 50812af0b083..b84c22e06c5f 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -74,14 +74,10 @@ tmp0 = %rax
W_SIZE = 80*8
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -283,18 +279,18 @@ SYM_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -355,18 +351,19 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_ssse3)
########################################################################
diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S
new file mode 100644
index 000000000000..b12b9efb5ec5
--- /dev/null
+++ b/arch/x86/crypto/sm3-avx-asm_64.S
@@ -0,0 +1,517 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM3 AVX accelerated transform.
+ * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM3 AES/BMI2 accelerated work by libgcrypt at:
+ * https://gnupg.org/software/libgcrypt/index.html
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+#define state_h5 20
+#define state_h6 24
+#define state_h7 28
+
+/* Constants */
+
+/* Round constant macros */
+
+#define K0 2043430169 /* 0x79cc4519 */
+#define K1 -208106958 /* 0xf3988a32 */
+#define K2 -416213915 /* 0xe7311465 */
+#define K3 -832427829 /* 0xce6228cb */
+#define K4 -1664855657 /* 0x9cc45197 */
+#define K5 965255983 /* 0x3988a32f */
+#define K6 1930511966 /* 0x7311465e */
+#define K7 -433943364 /* 0xe6228cbc */
+#define K8 -867886727 /* 0xcc451979 */
+#define K9 -1735773453 /* 0x988a32f3 */
+#define K10 823420391 /* 0x311465e7 */
+#define K11 1646840782 /* 0x6228cbce */
+#define K12 -1001285732 /* 0xc451979c */
+#define K13 -2002571463 /* 0x88a32f39 */
+#define K14 289824371 /* 0x11465e73 */
+#define K15 579648742 /* 0x228cbce6 */
+#define K16 -1651869049 /* 0x9d8a7a87 */
+#define K17 991229199 /* 0x3b14f50f */
+#define K18 1982458398 /* 0x7629ea1e */
+#define K19 -330050500 /* 0xec53d43c */
+#define K20 -660100999 /* 0xd8a7a879 */
+#define K21 -1320201997 /* 0xb14f50f3 */
+#define K22 1654563303 /* 0x629ea1e7 */
+#define K23 -985840690 /* 0xc53d43ce */
+#define K24 -1971681379 /* 0x8a7a879d */
+#define K25 351604539 /* 0x14f50f3b */
+#define K26 703209078 /* 0x29ea1e76 */
+#define K27 1406418156 /* 0x53d43cec */
+#define K28 -1482130984 /* 0xa7a879d8 */
+#define K29 1330705329 /* 0x4f50f3b1 */
+#define K30 -1633556638 /* 0x9ea1e762 */
+#define K31 1027854021 /* 0x3d43cec5 */
+#define K32 2055708042 /* 0x7a879d8a */
+#define K33 -183551212 /* 0xf50f3b14 */
+#define K34 -367102423 /* 0xea1e7629 */
+#define K35 -734204845 /* 0xd43cec53 */
+#define K36 -1468409689 /* 0xa879d8a7 */
+#define K37 1358147919 /* 0x50f3b14f */
+#define K38 -1578671458 /* 0xa1e7629e */
+#define K39 1137624381 /* 0x43cec53d */
+#define K40 -2019718534 /* 0x879d8a7a */
+#define K41 255530229 /* 0x0f3b14f5 */
+#define K42 511060458 /* 0x1e7629ea */
+#define K43 1022120916 /* 0x3cec53d4 */
+#define K44 2044241832 /* 0x79d8a7a8 */
+#define K45 -206483632 /* 0xf3b14f50 */
+#define K46 -412967263 /* 0xe7629ea1 */
+#define K47 -825934525 /* 0xcec53d43 */
+#define K48 -1651869049 /* 0x9d8a7a87 */
+#define K49 991229199 /* 0x3b14f50f */
+#define K50 1982458398 /* 0x7629ea1e */
+#define K51 -330050500 /* 0xec53d43c */
+#define K52 -660100999 /* 0xd8a7a879 */
+#define K53 -1320201997 /* 0xb14f50f3 */
+#define K54 1654563303 /* 0x629ea1e7 */
+#define K55 -985840690 /* 0xc53d43ce */
+#define K56 -1971681379 /* 0x8a7a879d */
+#define K57 351604539 /* 0x14f50f3b */
+#define K58 703209078 /* 0x29ea1e76 */
+#define K59 1406418156 /* 0x53d43cec */
+#define K60 -1482130984 /* 0xa7a879d8 */
+#define K61 1330705329 /* 0x4f50f3b1 */
+#define K62 -1633556638 /* 0x9ea1e762 */
+#define K63 1027854021 /* 0x3d43cec5 */
+
+/* Register macros */
+
+#define RSTATE %rdi
+#define RDATA %rsi
+#define RNBLKS %rdx
+
+#define t0 %eax
+#define t1 %ebx
+#define t2 %ecx
+
+#define a %r8d
+#define b %r9d
+#define c %r10d
+#define d %r11d
+#define e %r12d
+#define f %r13d
+#define g %r14d
+#define h %r15d
+
+#define W0 %xmm0
+#define W1 %xmm1
+#define W2 %xmm2
+#define W3 %xmm3
+#define W4 %xmm4
+#define W5 %xmm5
+
+#define XTMP0 %xmm6
+#define XTMP1 %xmm7
+#define XTMP2 %xmm8
+#define XTMP3 %xmm9
+#define XTMP4 %xmm10
+#define XTMP5 %xmm11
+#define XTMP6 %xmm12
+
+#define BSWAP_REG %xmm15
+
+/* Stack structure */
+
+#define STACK_W_SIZE (32 * 2 * 3)
+#define STACK_REG_SAVE_SIZE (64)
+
+#define STACK_W (0)
+#define STACK_REG_SAVE (STACK_W + STACK_W_SIZE)
+#define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)
+
+/* Instruction helpers. */
+
+#define roll2(v, reg) \
+ roll $(v), reg;
+
+#define roll3mov(v, src, dst) \
+ movl src, dst; \
+ roll $(v), dst;
+
+#define roll3(v, src, dst) \
+ rorxl $(32-(v)), src, dst;
+
+#define addl2(a, out) \
+ leal (a, out), out;
+
+/* Round function macros. */
+
+#define GG1(x, y, z, o, t) \
+ movl x, o; \
+ xorl y, o; \
+ xorl z, o;
+
+#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)
+
+#define GG2(x, y, z, o, t) \
+ andnl z, x, o; \
+ movl y, t; \
+ andl x, t; \
+ addl2(t, o);
+
+#define FF2(x, y, z, o, t) \
+ movl y, o; \
+ xorl x, o; \
+ movl y, t; \
+ andl x, t; \
+ andl z, o; \
+ xorl t, o;
+
+#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \
+ /* rol(a, 12) => t0 */ \
+ roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
+ /* rol (t0 + e + t), 7) => t1 */ \
+ leal K##round(t0, e, 1), t1; \
+ roll2(7, t1); \
+ /* h + w1 => h */ \
+ addl wtype##_W1_ADDR(round, widx), h; \
+ /* h + t1 => h */ \
+ addl2(t1, h); \
+ /* t1 ^ t0 => t0 */ \
+ xorl t1, t0; \
+ /* w1w2 + d => d */ \
+ addl wtype##_W1W2_ADDR(round, widx), d; \
+ /* FF##i(a,b,c) => t1 */ \
+ FF##i(a, b, c, t1, t2); \
+ /* d + t1 => d */ \
+ addl2(t1, d); \
+ /* GG#i(e,f,g) => t2 */ \
+ GG##i(e, f, g, t2, t1); \
+ /* h + t2 => h */ \
+ addl2(t2, h); \
+ /* rol (f, 19) => f */ \
+ roll2(19, f); \
+ /* d + t0 => d */ \
+ addl2(t0, d); \
+ /* rol (b, 9) => b */ \
+ roll2(9, b); \
+ /* P0(h) => h */ \
+ roll3(9, h, t2); \
+ roll3(17, h, t1); \
+ xorl t2, h; \
+ xorl t1, h;
+
+#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(1, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(2, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+/* Input expansion macros. */
+
+/* Byte-swapped input address. */
+#define IW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Expanded input address. */
+#define XW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Rounds 1-12, byte-swapped input block addresses. */
+#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0)
+#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
+
+/* Rounds 1-12, expanded input block addresses. */
+#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0)
+#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)
+
+/* Input block loading. */
+#define LOAD_W_XMM_1() \
+ vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \
+ vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \
+ vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \
+ vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \
+ vpshufb BSWAP_REG, XTMP0, XTMP0; \
+ vpshufb BSWAP_REG, XTMP1, XTMP1; \
+ vpshufb BSWAP_REG, XTMP2, XTMP2; \
+ vpshufb BSWAP_REG, XTMP3, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP4; \
+ vpxor XTMP1, XTMP2, XTMP5; \
+ vpxor XTMP2, XTMP3, XTMP6; \
+ leaq 64(RDATA), RDATA; \
+ vmovdqa XTMP0, IW_W1_ADDR(0, 0); \
+ vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \
+ vmovdqa XTMP1, IW_W1_ADDR(4, 0); \
+ vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);
+
+#define LOAD_W_XMM_2() \
+ vmovdqa XTMP2, IW_W1_ADDR(8, 0); \
+ vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);
+
+#define LOAD_W_XMM_3() \
+ vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \
+ vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \
+ vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \
+ vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \
+ vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \
+ vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */
+
+/* Message scheduling. Note: 3 words per XMM register. */
+#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \
+ /* Load (w[i - 16]) => XTMP0 */ \
+ vpshufd $0b10111111, w0, XTMP0; \
+ vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \
+ /* Load (w[i - 13]) => XTMP1 */ \
+ vpshufd $0b10111111, w1, XTMP1; \
+ vpalignr $12, XTMP1, w2, XTMP1; \
+ /* w[i - 9] == w3 */ \
+ /* XMM3 ^ XTMP0 => XTMP0 */ \
+ vpxor w3, XTMP0, XTMP0;
+
+#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 3] == w5 */ \
+ /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \
+ vpslld $15, w5, XTMP2; \
+ vpsrld $(32-15), w5, XTMP3; \
+ vpxor XTMP2, XTMP3, XTMP3; \
+ vpxor XTMP3, XTMP0, XTMP0; \
+ /* rol(XTMP1, 7) => XTMP1 */ \
+ vpslld $7, XTMP1, XTMP5; \
+ vpsrld $(32-7), XTMP1, XTMP1; \
+ vpxor XTMP5, XTMP1, XTMP1; \
+ /* XMM4 ^ XTMP1 => XTMP1 */ \
+ vpxor w4, XTMP1, XTMP1; \
+ /* w[i - 6] == XMM4 */ \
+ /* P1(XTMP0) ^ XTMP1 => XMM0 */ \
+ vpslld $15, XTMP0, XTMP5; \
+ vpsrld $(32-15), XTMP0, XTMP6; \
+ vpslld $23, XTMP0, XTMP2; \
+ vpsrld $(32-23), XTMP0, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP1; \
+ vpxor XTMP6, XTMP5, XTMP5; \
+ vpxor XTMP3, XTMP2, XTMP2; \
+ vpxor XTMP2, XTMP5, XTMP5; \
+ vpxor XTMP5, XTMP1, w0;
+
+#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \
+ /* W1 in XMM12 */ \
+ vpshufd $0b10111111, w4, XTMP4; \
+ vpalignr $12, XTMP4, w5, XTMP4; \
+ vmovdqa XTMP4, XW_W1_ADDR((round), 0); \
+ /* W1 ^ W2 => XTMP1 */ \
+ vpxor w0, XTMP4, XTMP1; \
+ vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lbe32mask:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.text
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sm3_transform_avx(struct sm3_state *state,
+ * const u8 *data, int nblocks);
+ */
+.align 16
+SYM_FUNC_START(sm3_transform_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblocks
+ */
+ vzeroupper;
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ movq %rdx, RNBLKS;
+
+ subq $STACK_SIZE, %rsp;
+ andq $(~63), %rsp;
+
+ movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
+ movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
+ movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
+ movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
+ movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);
+
+ vmovdqa .Lbe32mask (%rip), BSWAP_REG;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ movl state_h5(RSTATE), f;
+ movl state_h6(RSTATE), g;
+ movl state_h7(RSTATE), h;
+
+.align 16
+.Loop:
+ /* Load data part1. */
+ LOAD_W_XMM_1();
+
+ leaq -1(RNBLKS), RNBLKS;
+
+ /* Transform 0-3 + Load data part2. */
+ R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
+ R1(d, a, b, c, h, e, f, g, 1, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 2, 2, IW);
+ R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();
+
+ /* Transform 4-7 + Precalc 12-14. */
+ R1(a, b, c, d, e, f, g, h, 4, 0, IW);
+ R1(d, a, b, c, h, e, f, g, 5, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
+ R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 8-11 + Precalc 12-17. */
+ R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
+ R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
+ R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
+ R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 12-14 + Precalc 18-20 */
+ R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
+ R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
+ R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 15-17 + Precalc 21-23 */
+ R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 18-20 + Precalc 24-26 */
+ R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 21-23 + Precalc 27-29 */
+ R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
+ R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 24-26 + Precalc 30-32 */
+ R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
+ R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
+ R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 27-29 + Precalc 33-35 */
+ R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
+ R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
+ R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 30-32 + Precalc 36-38 */
+ R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
+ R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
+ R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 33-35 + Precalc 39-41 */
+ R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
+ R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
+ R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 36-38 + Precalc 42-44 */
+ R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
+ R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
+ R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 39-41 + Precalc 45-47 */
+ R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
+ R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
+ R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 42-44 + Precalc 48-50 */
+ R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
+ R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
+ R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 45-47 + Precalc 51-53 */
+ R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
+ R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
+ R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 48-50 + Precalc 54-56 */
+ R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
+ R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
+ R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 51-53 + Precalc 57-59 */
+ R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 54-56 + Precalc 60-62 */
+ R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 57-59 + Precalc 63 */
+ R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 58, 1, XW);
+ R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 60-62 + Precalc 63 */
+ R2(a, b, c, d, e, f, g, h, 60, 0, XW);
+ R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 62, 2, XW);
+
+ /* Transform 63 */
+ R2(b, c, d, a, f, g, h, e, 63, 0, XW);
+
+ /* Update the chaining variables. */
+ xorl state_h0(RSTATE), a;
+ xorl state_h1(RSTATE), b;
+ xorl state_h2(RSTATE), c;
+ xorl state_h3(RSTATE), d;
+ movl a, state_h0(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl d, state_h3(RSTATE);
+ xorl state_h4(RSTATE), e;
+ xorl state_h5(RSTATE), f;
+ xorl state_h6(RSTATE), g;
+ xorl state_h7(RSTATE), h;
+ movl e, state_h4(RSTATE);
+ movl f, state_h5(RSTATE);
+ movl g, state_h6(RSTATE);
+ movl h, state_h7(RSTATE);
+
+ cmpq $0, RNBLKS;
+ jne .Loop;
+
+ vzeroall;
+
+ movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
+ movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
+ movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
+ movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
+ movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;
+
+ vmovdqa %xmm0, IW_W1_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(8, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);
+
+ movq %rbp, %rsp;
+ popq %rbp;
+ RET;
+SYM_FUNC_END(sm3_transform_avx)
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
new file mode 100644
index 000000000000..661b6f22ffcd
--- /dev/null
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM3 Secure Hash Algorithm, AVX assembler accelerated.
+ * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <asm/simd.h>
+
+asmlinkage void sm3_transform_avx(struct sm3_state *state,
+ const u8 *data, int nblocks);
+
+static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable() ||
+ (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) {
+ sm3_update(sctx, data, len);
+ return 0;
+ }
+
+ /*
+ * Make sure struct sm3_state begins directly with the SM3
+ * 256-bit internal state, as this is what the asm functions expect.
+ */
+ BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
+
+ kernel_fpu_begin();
+ sm3_base_do_update(desc, data, len, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (len)
+ sm3_update(sctx, data, len);
+
+ sm3_final(sctx, out);
+ return 0;
+ }
+
+ kernel_fpu_begin();
+ if (len)
+ sm3_base_do_update(desc, data, len, sm3_transform_avx);
+ sm3_base_do_finalize(desc, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static int sm3_avx_final(struct shash_desc *desc, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ sm3_final(shash_desc_ctx(desc), out);
+ return 0;
+ }
+
+ kernel_fpu_begin();
+ sm3_base_do_finalize(desc, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static struct shash_alg sm3_avx_alg = {
+ .digestsize = SM3_DIGEST_SIZE,
+ .init = sm3_base_init,
+ .update = sm3_avx_update,
+ .final = sm3_avx_final,
+ .finup = sm3_avx_finup,
+ .descsize = sizeof(struct sm3_state),
+ .base = {
+ .cra_name = "sm3",
+ .cra_driver_name = "sm3-avx",
+ .cra_priority = 300,
+ .cra_blocksize = SM3_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sm3_avx_mod_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX instruction are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_BMI2)) {
+ pr_info("BMI2 instruction are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_shash(&sm3_avx_alg);
+}
+
+static void __exit sm3_avx_mod_exit(void)
+{
+ crypto_unregister_shash(&sm3_avx_alg);
+}
+
+module_init(sm3_avx_mod_init);
+module_exit(sm3_avx_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM3 Secure Hash Algorithm, AVX assembler accelerated");
+MODULE_ALIAS_CRYPTO("sm3");
+MODULE_ALIAS_CRYPTO("sm3-avx");
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..4767ab61ff48
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction.
+ */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+
+.text
+.align 16
+
+/*
+ * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_crypt4)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ FRAME_BEGIN
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt4)
+
+.align 8
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
+ vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
+ vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vmovdqa .Linv_shift_row rRIP, RTMP4; \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ vaesenclast MASK_4BIT, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk8)
+
+/*
+ * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_crypt8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..8 blocks)
+ * %rdx: src (1..8 blocks)
+ * %rcx: num blocks (1..8)
+ */
+ cmpq $5, %rcx;
+ jb sm4_aesni_avx_crypt4;
+
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqa RB0, RB1;
+ vmovdqa RB0, RB2;
+ vmovdqa RB0, RB3;
+ je .Lblk8_load_input_done;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ cmpq $7, %rcx;
+ jb .Lblk8_load_input_done;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ je .Lblk8_load_input_done;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+.Lblk8_load_input_done:
+ call __sm4_crypt_blk8;
+
+ cmpq $6, %rcx;
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ jb .Lblk8_store_output_done;
+ vmovdqu RB1, (5 * 16)(%rsi);
+ je .Lblk8_store_output_done;
+ vmovdqu RB2, (6 * 16)(%rsi);
+ cmpq $7, %rcx;
+ je .Lblk8_store_output_done;
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt8)
+
+/*
+ * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RA0;
+
+ vmovdqa .Lbswap128_mask rRIP, RBSWAP;
+ vpshufb RBSWAP, RA0, RTMP0; /* be => le */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
+ vpshufb RBSWAP, RTMP0, RA1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
+ vpshufb RBSWAP, RTMP0, RA2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
+ vpshufb RBSWAP, RTMP0, RA3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
+ vpshufb RBSWAP, RTMP0, RB0;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
+ vpshufb RBSWAP, RTMP0, RB1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
+ vpshufb RBSWAP, RTMP0, RB2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
+ vpshufb RBSWAP, RTMP0, RB3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
+ vpshufb RBSWAP, RTMP0, RTMP1;
+
+ /* store new IV */
+ vmovdqu RTMP1, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
+
+/*
+ * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+ call __sm4_crypt_blk8;
+
+ vmovdqu (7 * 16)(%rdx), RNOT;
+ vpxor (%rcx), RA0, RA0;
+ vpxor (0 * 16)(%rdx), RA1, RA1;
+ vpxor (1 * 16)(%rdx), RA2, RA2;
+ vpxor (2 * 16)(%rdx), RA3, RA3;
+ vpxor (3 * 16)(%rdx), RB0, RB0;
+ vpxor (4 * 16)(%rdx), RB1, RB1;
+ vpxor (5 * 16)(%rdx), RB2, RB2;
+ vpxor (6 * 16)(%rdx), RB3, RB3;
+ vmovdqu RNOT, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
+
+/*
+ * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ /* Load input */
+ vmovdqu (%rcx), RA0;
+ vmovdqu 0 * 16(%rdx), RA1;
+ vmovdqu 1 * 16(%rdx), RA2;
+ vmovdqu 2 * 16(%rdx), RA3;
+ vmovdqu 3 * 16(%rdx), RB0;
+ vmovdqu 4 * 16(%rdx), RB1;
+ vmovdqu 5 * 16(%rdx), RB2;
+ vmovdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu 7 * 16(%rdx), RNOT;
+ vmovdqu RNOT, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..4732fe8bb65b
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+/* vector registers */
+#define RX0 %ymm0
+#define RX1 %ymm1
+#define MASK_4BIT %ymm2
+#define RTMP0 %ymm3
+#define RTMP1 %ymm4
+#define RTMP2 %ymm5
+#define RTMP3 %ymm6
+#define RTMP4 %ymm7
+
+#define RA0 %ymm8
+#define RA1 %ymm9
+#define RA2 %ymm10
+#define RA3 %ymm11
+
+#define RB0 %ymm12
+#define RB1 %ymm13
+#define RB2 %ymm14
+#define RB3 %ymm15
+
+#define RNOT %ymm0
+#define RBSWAP %ymm1
+
+#define RX0x %xmm0
+#define RX1x %xmm1
+#define MASK_4BITx %xmm2
+
+#define RNOTx %xmm0
+#define RBSWAPx %xmm1
+
+#define RTMP0x %xmm3
+#define RTMP1x %xmm4
+#define RTMP2x %xmm5
+#define RTMP3x %xmm6
+#define RTMP4x %xmm7
+
+
+/* helper macros */
+
+/* Transpose four 32-bit words between 128-bit vector lanes. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* post-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction. */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+.text
+.align 16
+
+.align 8
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0; \
+ vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
+ vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
+ vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vextracti128 $1, RX0, RTMP4x; \
+ vextracti128 $1, RX1, RTMP0x; \
+ vaesenclast MASK_4BITx, RX0x, RX0x; \
+ vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
+ vaesenclast MASK_4BITx, RX1x, RX1x; \
+ vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
+ vinserti128 $1, RTMP4x, RX0, RX0; \
+ vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
+ vinserti128 $1, RTMP0x, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP3, r0, r0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk16)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+/*
+ * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
+
+/*
+ * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __sm4_crypt_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
+
+/*
+ * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RA1;
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h
new file mode 100644
index 000000000000..1bceab7516aa
--- /dev/null
+++ b/arch/x86/crypto/sm4-avx.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ASM_X86_SM4_AVX_H
+#define ASM_X86_SM4_AVX_H
+
+#include <linux/types.h>
+#include <crypto/sm4.h>
+
+typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv);
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req);
+int sm4_avx_ecb_decrypt(struct skcipher_request *req);
+
+int sm4_cbc_encrypt(struct skcipher_request *req);
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+int sm4_cfb_encrypt(struct skcipher_request *req);
+int sm4_avx_cfb_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+#endif
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
new file mode 100644
index 000000000000..84bc718f49a3
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16)
+
+asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_cbc_dec_blk16);
+}
+
+
+static int cfb_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_cfb_dec_blk16);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_ctr_enc_blk16);
+}
+
+static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(sm4)",
+ .cra_driver_name = "__ecb-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(sm4)",
+ .cra_driver_name = "__cbc-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cfb(sm4)",
+ .cra_driver_name = "__cfb-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__ctr(sm4)",
+ .cra_driver_name = "__ctr-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *
+simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers),
+ simd_sm4_aesni_avx2_skciphers);
+}
+
+static void __exit sm4_exit(void)
+{
+ simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers),
+ simd_sm4_aesni_avx2_skciphers);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx2");
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
new file mode 100644
index 000000000000..7800f77d68ad
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8)
+
+asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+ while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) {
+ sm4_aesni_avx_crypt8(rkey, dst, src, 8);
+ dst += SM4_CRYPT8_BLOCK_SIZE;
+ src += SM4_CRYPT8_BLOCK_SIZE;
+ nbytes -= SM4_CRYPT8_BLOCK_SIZE;
+ }
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ unsigned int nblocks = min(nbytes >> 4, 4u);
+ sm4_aesni_avx_crypt4(rkey, dst, src, nblocks);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_enc);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt);
+
+int sm4_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_dec);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt);
+
+int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_cbc_encrypt);
+
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_dec, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ u8 iv[SM4_BLOCK_SIZE];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream,
+ src, nblocks);
+
+ src += ((int)nblocks - 2) * SM4_BLOCK_SIZE;
+ dst += (nblocks - 1) * SM4_BLOCK_SIZE;
+ memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE);
+
+ for (i = nblocks - 1; i > 0; i--) {
+ crypto_xor_cpy(dst, src,
+ &keystream[i * SM4_BLOCK_SIZE],
+ SM4_BLOCK_SIZE);
+ src -= SM4_BLOCK_SIZE;
+ dst -= SM4_BLOCK_SIZE;
+ }
+ crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE);
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += (nblocks + 1) * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt);
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_cbc_dec_blk8);
+}
+
+int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, iv);
+ crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_cfb_encrypt);
+
+int sm4_avx_cfb_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_enc, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ if (nblocks > 1)
+ memcpy(&keystream[SM4_BLOCK_SIZE], src,
+ (nblocks - 1) * SM4_BLOCK_SIZE);
+ memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE,
+ SM4_BLOCK_SIZE);
+
+ sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
+ keystream, nblocks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblocks * SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt);
+
+static int cfb_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_cfb_dec_blk8);
+}
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_enc, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ for (i = 0; i < nblocks; i++) {
+ memcpy(&keystream[i * SM4_BLOCK_SIZE],
+ walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ }
+ sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
+ keystream, nblocks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblocks * SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, keystream);
+
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt);
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_ctr_enc_blk8);
+}
+
+static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(sm4)",
+ .cra_driver_name = "__ecb-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(sm4)",
+ .cra_driver_name = "__cbc-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cfb(sm4)",
+ .cra_driver_name = "__cfb-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__ctr(sm4)",
+ .cra_driver_name = "__ctr-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *
+simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)];
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers),
+ simd_sm4_aesni_avx_skciphers);
+}
+
+static void __exit sm4_exit(void)
+{
+ simd_unregister_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers),
+ simd_sm4_aesni_avx_skciphers);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx");
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 37e63b3c664e..31f9b2ec3857 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -267,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk8)
.align 8
@@ -307,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_dec_blk8)
SYM_FUNC_START(twofish_ecb_enc_8way)
@@ -327,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way)
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_enc_8way)
SYM_FUNC_START(twofish_ecb_dec_8way)
@@ -347,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_dec_8way)
SYM_FUNC_START(twofish_cbc_dec_8way)
@@ -372,5 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way)
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_cbc_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index a6f09e4f2e46..3abcad661884 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index fc23552afe37..d2288bf38a8a 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -88,7 +88,7 @@
/*
* Combined G1 & G2 function. Reordered with help of rotates to have moves
- * at begining.
+ * at beginning.
*/
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
/* G1,1 && G2,1 */ \
@@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
.L__enc_xor3:
outunpack_enc3(xor);
@@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
SYM_FUNC_START(twofish_dec_blk_3way)
@@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index d2e56232494a..775af290cd19 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 77e06c2da83d..f9c4adc27404 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -81,18 +81,18 @@ static struct crypto_alg alg = {
}
};
-static int __init init(void)
+static int __init twofish_glue_init(void)
{
return crypto_register_alg(&alg);
}
-static void __exit fini(void)
+static void __exit twofish_glue_fini(void)
{
crypto_unregister_alg(&alg);
}
-module_init(init);
-module_exit(fini);
+module_init(twofish_glue_init);
+module_exit(twofish_glue_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 03725696397c..90454cf18e0d 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -117,7 +117,7 @@ static bool is_blacklisted_cpu(void)
* storing blocks in 64bit registers to allow three blocks to
* be processed parallel. Parallel operation then allows gaining
* more performance than was trade off, on out-of-order CPUs.
- * However Atom does not benefit from this parallellism and
+ * However Atom does not benefit from this parallelism and
* should be blacklisted.
*/
return true;
@@ -140,7 +140,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init twofish_3way_init(void)
{
if (!force && is_blacklisted_cpu()) {
printk(KERN_INFO
@@ -154,13 +154,13 @@ static int __init init(void)
ARRAY_SIZE(tf_skciphers));
}
-static void __exit fini(void)
+static void __exit twofish_3way_fini(void)
{
crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers));
}
-module_init(init);
-module_exit(fini);
+module_init(twofish_3way_init);
+module_exit(twofish_3way_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 08bf95dbc911..7fec5dcf6438 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,18 +8,8 @@ UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
-CFLAGS_syscall_64.o += -fno-stack-protector
-CFLAGS_syscall_32.o += -fno-stack-protector
-CFLAGS_syscall_x32.o += -fno-stack-protector
-
-CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
-CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
-CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,)
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 07a9331d55e7..29b36e9e4e74 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/ptrace-abi.h>
/*
@@ -62,42 +63,7 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-/* The layout forms the "struct pt_regs" on the stack: */
-/*
- * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
- * unless syscall needs a complete, fully filled "struct pt_regs".
- */
-#define R15 0*8
-#define R14 1*8
-#define R13 2*8
-#define R12 3*8
-#define RBP 4*8
-#define RBX 5*8
-/* These regs are callee-clobbered. Always saved on kernel entry. */
-#define R11 6*8
-#define R10 7*8
-#define R9 8*8
-#define R8 9*8
-#define RAX 10*8
-#define RCX 11*8
-#define RDX 12*8
-#define RSI 13*8
-#define RDI 14*8
-/*
- * On syscall entry, this is syscall#. On CPU exception, this is error code.
- * On hw interrupt, it's IRQ number:
- */
-#define ORIG_RAX 15*8
-/* Return frame for iretq */
-#define RIP 16*8
-#define CS 17*8
-#define EFLAGS 18*8
-#define RSP 19*8
-#define SS 20*8
-
-#define SIZEOF_PTREGS 21*8
-
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -107,7 +73,7 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rsi /* pt_regs->si */
.endif
pushq \rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
+ pushq \rcx /* pt_regs->cx */
pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
@@ -124,13 +90,16 @@ For 32-bit we have the following conventions - kernel is built with
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
+.endm
+.macro CLEAR_REGS
/*
* Sanitize registers of values that a speculation attack might
* otherwise want to exploit. The lower registers are likely clobbered
* well before they could be put to use in a speculative execution
* gadget.
*/
+ xorl %esi, %esi /* nospec si */
xorl %edx, %edx /* nospec dx */
xorl %ecx, %ecx /* nospec cx */
xorl %r8d, %r8d /* nospec r8 */
@@ -146,27 +115,24 @@ For 32-bit we have the following conventions - kernel is built with
.endm
-.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+ PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret
+ CLEAR_REGS
+.endm
+
+.macro POP_REGS pop_rdi=1
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .if \skip_r11rcx
- popq %rsi
- .else
popq %r11
- .endif
popq %r10
popq %r9
popq %r8
popq %rax
- .if \skip_r11rcx
- popq %rsi
- .else
popq %rcx
- .endif
popq %rdx
popq %rsi
.if \pop_rdi
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4efd39aacb9f..6c2826417b33 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -36,59 +36,97 @@
#include <asm/irq_stack.h>
#ifdef CONFIG_X86_64
-__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = sys_call_table[unr](regs);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[xnr](regs);
+ return true;
+ }
+ return false;
+}
+
+__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
+ add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
- if (likely(nr < NR_syscalls)) {
- nr = array_index_nospec(nr, NR_syscalls);
- regs->ax = sys_call_table[nr](regs);
-#ifdef CONFIG_X86_X32_ABI
- } else if (likely((nr & __X32_SYSCALL_BIT) &&
- (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
- nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
- X32_NR_syscalls);
- regs->ax = x32_sys_call_table[nr](regs);
-#endif
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
}
+
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
#endif
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_IA32_EMULATION))
current_thread_info()->status |= TS_COMPAT;
- return (unsigned int)regs->orig_ax;
+ return (int)regs->orig_ax;
}
/*
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
*/
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
- unsigned int nr)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
- if (likely(nr < IA32_NR_syscalls)) {
- nr = array_index_nospec(nr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[nr](regs);
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call_table[unr](regs);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
}
}
/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
+ add_random_kstack_offset();
/*
- * Subtlety here: if ptrace pokes something larger than 2^32-1 into
- * orig_ax, the unsigned int return value truncates it. This may
- * or may not be necessary, but it matches the old asm behavior.
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
*/
- nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+ nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
do_syscall_32_irqs_on(regs, nr);
@@ -99,9 +137,10 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
int res;
+ add_random_kstack_offset();
/*
* This cannot use syscall_enter_from_user_mode() as it has to
* fetch EBP before invoking any of the syscall entry work
@@ -127,14 +166,13 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
/* User code screwed up. */
regs->ax = -EFAULT;
- instrumentation_end();
local_irq_disable();
+ instrumentation_end();
irqentry_exit_to_user_mode(regs);
return false;
}
- /* The case truncates any ptrace induced syscall nr > 2^32 -1 */
- nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
+ nr = syscall_enter_from_user_mode_work(regs, nr);
/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
@@ -266,15 +304,16 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
irqentry_state_t state = irqentry_enter(regs);
bool inhcall;
+ instrumentation_begin();
run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- instrumentation_begin();
irqentry_exit_cond_resched();
instrumentation_end();
restore_inhcall(inhcall);
} else {
+ instrumentation_end();
irqentry_exit(regs, state);
}
}
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..887420844066 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -20,7 +20,7 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 28(%esp) - unused -- was %gs on old stackprotector kernels
* 2C(%esp) - orig_eax
* 30(%esp) - %eip
* 34(%esp) - %cs
@@ -40,7 +40,7 @@
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
@@ -53,83 +53,6 @@
#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl $0
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl %gs
-.endm
-
-.macro POP_GS pop=0
-98: popl %gs
- .if \pop <> 0
- add $\pop, %esp
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
-
/* Unconditionally switch to user cr3 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
@@ -209,7 +132,7 @@
*
* Lets build a 5 entry IRET frame after that, such that struct pt_regs
* is complete and in particular regs->sp is correct. This gives us
- * the original 6 enties as gap:
+ * the original 6 entries as gap:
*
* 14*4(%esp) - <previous context>
* 13*4(%esp) - gap / flags
@@ -282,7 +205,7 @@
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld
.if \skip_gs == 0
- PUSH_GS
+ pushl $0
.endif
pushl %fs
@@ -307,9 +230,6 @@
movl $(__USER_DS), %edx
movl %edx, %ds
movl %edx, %es
-.if \skip_gs == 0
- SET_KERNEL_GS %edx
-.endif
/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
SWITCH_TO_KERNEL_STACK
@@ -348,20 +268,16 @@
1: popl %ds
2: popl %es
3: popl %fs
- POP_GS \pop
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
IRET_FRAME
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 5b)
- _ASM_EXTABLE(3b, 6b)
- POP_GS_EX
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
.endm
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
@@ -430,7 +346,7 @@
* will soon execute iret and the tracer was already set to
* the irqstate after the IRET:
*/
- DISABLE_INTERRUPTS(CLBR_ANY)
+ cli
lss (%esp), %esp /* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
@@ -779,7 +695,7 @@ SYM_CODE_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movl TASK_stack_canary(%edx), %ebx
- movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+ movl %ebx, PER_CPU_VAR(__stack_chk_guard)
#endif
#ifdef CONFIG_RETPOLINE
@@ -821,7 +737,7 @@ SYM_FUNC_START(schedule_tail_wrapper)
popl %eax
FRAME_END
- ret
+ RET
SYM_FUNC_END(schedule_tail_wrapper)
.popsection
@@ -976,7 +892,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl PT_EIP(%esp), %edx /* pt_regs->ip */
movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
popl %ebx /* pt_regs->bx */
addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
@@ -1007,12 +922,9 @@ SYM_FUNC_START(entry_SYSENTER_32)
sti
sysexit
-.pushsection .fixup, "ax"
-2: movl $0, PT_FS(%esp)
- jmp 1b
-.popsection
+2: movl $0, PT_FS(%esp)
+ jmp 1b
_ASM_EXTABLE(1b, 2b)
- PTGS_TO_GS_EX
.Lsysenter_fix_flags:
pushl $X86_EFLAGS_FIXED
@@ -1077,10 +989,9 @@ restore_all_switch_stack:
* when returning from IPI handler and when returning from
* scheduler to user-space.
*/
- INTERRUPT_RETURN
+ iret
-.section .fixup, "ax"
-SYM_CODE_START(asm_iret_error)
+.Lasm_iret_error:
pushl $0 # no error code
pushl $iret_error
@@ -1097,9 +1008,8 @@ SYM_CODE_START(asm_iret_error)
#endif
jmp handle_exception
-SYM_CODE_END(asm_iret_error)
-.previous
- _ASM_EXTABLE(.Lirq_return, asm_iret_error)
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -1154,11 +1064,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
- /* fixup %gs */
- GS_TO_REG %ecx
movl PT_GS(%esp), %edi # get the function address
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
/* fixup orig %eax */
movl PT_ORIG_EAX(%esp), %edx # get the error code
@@ -1335,14 +1241,14 @@ SYM_CODE_START(asm_exc_nmi)
SYM_CODE_END(asm_exc_nmi)
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
- call do_exit
+ call make_task_dead
1: jmp 1b
-SYM_CODE_END(rewind_stack_do_exit)
+SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 400908dff42e..4300ba49b5ee 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -86,6 +86,7 @@
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
+ ENDBR
swapgs
/* tss.sp2 is scratch space. */
@@ -94,6 +95,7 @@ SYM_CODE_START(entry_SYSCALL_64)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
@@ -107,8 +109,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
- movq %rax, %rdi
- movq %rsp, %rsi
+ movq %rsp, %rdi
+ /* Sign extend the lower 32bit as syscall numbers are treated as int */
+ movslq %eax, %rsi
call do_syscall_64 /* returns with IRQs disabled */
/*
@@ -188,8 +191,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
- /* rcx and r11 are already restored (see code above) */
- POP_REGS pop_rdi=0 skip_r11rcx=1
+ POP_REGS pop_rdi=0
/*
* Now all regs are restored except RSP and RDI.
@@ -212,8 +214,13 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
swapgs
sysretq
+SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -275,6 +282,7 @@ SYM_FUNC_END(__switch_to_asm)
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR // copy_thread
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -305,7 +313,7 @@ SYM_CODE_END(ret_from_fork)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
- SAVE_FLAGS(CLBR_RAX)
+ SAVE_FLAGS
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
@@ -314,6 +322,14 @@ SYM_CODE_END(ret_from_fork)
#endif
.endm
+/* Save all registers in pt_regs */
+SYM_CODE_START_LOCAL(push_and_clear_regs)
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ RET
+SYM_CODE_END(push_and_clear_regs)
+
/**
* idtentry_body - Macro to emit code calling the C function
* @cfunc: C function to be called
@@ -321,7 +337,21 @@ SYM_CODE_END(ret_from_fork)
*/
.macro idtentry_body cfunc has_error_code:req
- call error_entry
+ call push_and_clear_regs
+ UNWIND_HINT_REGS
+
+ /*
+ * Call error_entry() and switch to the task stack if from userspace.
+ *
+ * When in XENPV, it is already in the task stack, and it can't fault
+ * for native_iret() nor native_load_gs_index() since XENPV uses its
+ * own pvops for IRET and load_gs_index(). And it doesn't need to
+ * switch the CR3. So it can skip invoking error_entry().
+ */
+ ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+ "", X86_FEATURE_XENPV
+
+ ENCODE_FRAME_POINTER
UNWIND_HINT_REGS
movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
@@ -333,6 +363,9 @@ SYM_CODE_END(ret_from_fork)
call \cfunc
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
jmp error_return
.endm
@@ -349,7 +382,9 @@ SYM_CODE_END(ret_from_fork)
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ ENDBR
ASM_CLAC
+ cld
.if \has_error_code == 0
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -416,7 +451,9 @@ SYM_CODE_END(\asmsym)
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS
+ ENDBR
ASM_CLAC
+ cld
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -471,7 +508,9 @@ SYM_CODE_END(\asmsym)
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS
+ ENDBR
ASM_CLAC
+ cld
/*
* If the entry is from userspace, switch stacks and treat it as
@@ -498,6 +537,7 @@ SYM_CODE_START(\asmsym)
call vc_switch_off_ist
movq %rax, %rsp /* Switch to new stack */
+ ENCODE_FRAME_POINTER
UNWIND_HINT_REGS
/* Update pt_regs */
@@ -506,18 +546,18 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer */
- call \cfunc
+ call kernel_\cfunc
/*
* No need to switch back to the IST stack. The current stack is either
* identical to the stack in the IRET frame or the VC fall-back stack,
- * so it is definitly mapped even with PTI enabled.
+ * so it is definitely mapped even with PTI enabled.
*/
jmp paranoid_exit
/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
- idtentry_body safe_stack_\cfunc, has_error_code=1
+ idtentry_body user_\cfunc, has_error_code=1
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
@@ -532,7 +572,9 @@ SYM_CODE_END(\asmsym)
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
ASM_CLAC
+ cld
/* paranoid_entry returns GS information for paranoid_exit in EBX. */
call paranoid_entry
@@ -543,6 +585,9 @@ SYM_CODE_START(\asmsym)
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
call \cfunc
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
jmp paranoid_exit
_ASM_NOKPROBE(\asmsym)
@@ -563,6 +608,7 @@ __irqentry_text_start:
.align 16
.globl __irqentry_text_end
__irqentry_text_end:
+ ANNOTATE_NOENDBR
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
@@ -573,6 +619,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -603,8 +653,8 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
/* Restore RDI. */
popq %rdi
- SWAPGS
- INTERRUPT_RETURN
+ swapgs
+ jmp .Lnative_iret
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
@@ -621,9 +671,14 @@ SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
* when returning from IPI handler.
*/
- INTERRUPT_RETURN
+#ifdef CONFIG_XEN_PV
+SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ .byte 0xe9
+ .long .Lnative_iret - (. + 4)
+#endif
-SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
+.Lnative_iret:
UNWIND_HINT_IRET_REGS
/*
* Are we returning to a stack segment from the LDT? Note: in
@@ -635,6 +690,7 @@ SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
#endif
SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR // exc_double_fault
/*
* This may fault. Non-paranoid faults on return to userspace are
* handled by fixup_bad_iret. These include #SS, #GP, and #NP.
@@ -729,18 +785,15 @@ SYM_FUNC_START(asm_load_gs_index)
FRAME_BEGIN
swapgs
.Lgs_change:
+ ANNOTATE_NOENDBR // error_entry
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
swapgs
FRAME_END
- ret
-SYM_FUNC_END(asm_load_gs_index)
-EXPORT_SYMBOL(asm_load_gs_index)
+ RET
- _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
- .section .fixup, "ax"
/* running with kernelgs */
-SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
+.Lbad_gs:
swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
@@ -751,8 +804,11 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
xorl %eax, %eax
movl %eax, %gs
jmp 2b
-SYM_CODE_END(.Lbad_gs)
- .previous
+
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
#ifdef CONFIG_XEN_PV
/*
@@ -800,6 +856,7 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
*/
SYM_CODE_START(xen_failsafe_callback)
UNWIND_HINT_EMPTY
+ ENDBR
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
jne 1f
@@ -843,7 +900,6 @@ SYM_CODE_END(xen_failsafe_callback)
*/
SYM_CODE_START_LOCAL(paranoid_entry)
UNWIND_HINT_FUNC
- cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -884,11 +940,12 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* is needed here.
*/
SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
- ret
+ RET
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -896,22 +953,15 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
- ret
+ RET
SYM_CODE_END(paranoid_entry)
/*
@@ -963,13 +1013,10 @@ SYM_CODE_START_LOCAL(paranoid_exit)
SYM_CODE_END(paranoid_exit)
/*
- * Save all registers in pt_regs, and switch GS if needed.
+ * Switch GS and CR3 if needed.
*/
SYM_CODE_START_LOCAL(error_entry)
UNWIND_HINT_FUNC
- cld
- PUSH_AND_CLEAR_REGS save_ret=1
- ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -977,25 +1024,16 @@ SYM_CODE_START_LOCAL(error_entry)
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
*/
- SWAPGS
+ swapgs
FENCE_SWAPGS_USER_ENTRY
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
- popq %r12 /* save return addr in %12 */
- movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
- movq %rax, %rsp /* switch stack */
- ENCODE_FRAME_POINTER
- pushq %r12
- ret
-
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
+ RET
/*
* There are two places in the kernel that can potentially fault with
@@ -1018,9 +1056,16 @@ SYM_CODE_START_LOCAL(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+ swapgs
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ leaq 8(%rsp), %rax /* return pt_regs pointer */
+ RET
.Lbstep_iret:
/* Fix truncated RIP */
@@ -1032,7 +1077,7 @@ SYM_CODE_START_LOCAL(error_entry)
* We came from an IRET to user mode, so we have user
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
- SWAPGS
+ swapgs
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
@@ -1040,9 +1085,9 @@ SYM_CODE_START_LOCAL(error_entry)
* Pretend that the exception came from user mode: set up pt_regs
* as if we faulted immediately after IRET.
*/
- mov %rsp, %rdi
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
call fixup_bad_iret
- mov %rax, %rsp
+ mov %rax, %rdi
jmp .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
@@ -1064,6 +1109,7 @@ SYM_CODE_END(error_return)
*/
SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_REGS
+ ENDBR
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
@@ -1104,6 +1150,7 @@ SYM_CODE_START(asm_exc_nmi)
*/
ASM_CLAC
+ cld
/* Use %rdx as our temp variable throughout */
pushq %rdx
@@ -1123,7 +1170,6 @@ SYM_CODE_START(asm_exc_nmi)
*/
swapgs
- cld
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
@@ -1311,6 +1357,7 @@ first_nmi:
#endif
repeat_nmi:
+ ANNOTATE_NOENDBR // this code
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1339,6 +1386,7 @@ repeat_nmi:
.endr
subq $(5*8), %rsp
end_repeat_nmi:
+ ANNOTATE_NOENDBR // this code
/*
* Everything below this point can be preempted by a nested NMI.
@@ -1422,13 +1470,14 @@ SYM_CODE_END(asm_exc_nmi)
*/
SYM_CODE_START(ignore_sysret)
UNWIND_HINT_EMPTY
+ ENDBR
mov $-ENOSYS, %eax
sysretl
SYM_CODE_END(ignore_sysret)
#endif
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
@@ -1437,6 +1486,6 @@ SYM_CODE_START(rewind_stack_do_exit)
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
- call do_exit
-SYM_CODE_END(rewind_stack_do_exit)
+ call make_task_dead
+SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 0051cf5c792d..d1052742ad0c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,8 +48,9 @@
*/
SYM_CODE_START(entry_SYSENTER_compat)
UNWIND_HINT_EMPTY
+ ENDBR
/* Interrupts are off on entry. */
- SWAPGS
+ swapgs
pushq %rax
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
@@ -82,32 +83,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
movl %eax, %eax
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
- xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp (will be overwritten) */
- xorl %ebp, %ebp /* nospec rbp */
- pushq $0 /* pt_regs->r12 = 0 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq $0 /* pt_regs->r13 = 0 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq $0 /* pt_regs->r14 = 0 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq $0 /* pt_regs->r15 = 0 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
UNWIND_HINT_REGS
cld
@@ -147,6 +123,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR // is_sysenter_singlestep
SYM_CODE_END(entry_SYSENTER_compat)
/*
@@ -198,6 +175,7 @@ SYM_CODE_END(entry_SYSENTER_compat)
*/
SYM_CODE_START(entry_SYSCALL_compat)
UNWIND_HINT_EMPTY
+ ENDBR
/* Interrupts are off on entry. */
swapgs
@@ -211,6 +189,7 @@ SYM_CODE_START(entry_SYSCALL_compat)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
@@ -221,35 +200,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- xorl %esi, %esi /* nospec si */
- pushq %rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
- pushq %rbp /* pt_regs->cx (stashed in bp) */
- xorl %ecx, %ecx /* nospec cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
- xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp (will be overwritten) */
- xorl %ebp, %ebp /* nospec rbp */
- pushq $0 /* pt_regs->r12 = 0 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq $0 /* pt_regs->r13 = 0 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq $0 /* pt_regs->r14 = 0 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq $0 /* pt_regs->r15 = 0 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS
UNWIND_HINT_REGS
movq %rsp, %rdi
@@ -293,6 +244,8 @@ sysret32_from_system_call:
* code. We zero R8-R10 to avoid info leaks.
*/
movq RSP-ORIG_RAX(%rsp), %rsp
+SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/*
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
@@ -310,6 +263,9 @@ sysret32_from_system_call:
xorl %r10d, %r10d
swapgs
sysretl
+SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
SYM_CODE_END(entry_SYSCALL_compat)
/*
@@ -340,6 +296,7 @@ SYM_CODE_END(entry_SYSCALL_compat)
*/
SYM_CODE_START(entry_INT80_compat)
UNWIND_HINT_EMPTY
+ ENDBR
/*
* Interrupts are off on entry.
*/
@@ -357,54 +314,25 @@ SYM_CODE_START(entry_INT80_compat)
/* switch to thread stack expects orig_ax and rdi to be pushed */
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
/* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/* In the Xen PV case we already run on the thread stack. */
ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
- movq %rsp, %rdi
+ movq %rsp, %rax
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- pushq 6*8(%rdi) /* regs->ss */
- pushq 5*8(%rdi) /* regs->rsp */
- pushq 4*8(%rdi) /* regs->eflags */
- pushq 3*8(%rdi) /* regs->cs */
- pushq 2*8(%rdi) /* regs->ip */
- pushq 1*8(%rdi) /* regs->orig_ax */
- pushq (%rdi) /* pt_regs->di */
+ pushq 5*8(%rax) /* regs->ss */
+ pushq 4*8(%rax) /* regs->rsp */
+ pushq 3*8(%rax) /* regs->eflags */
+ pushq 2*8(%rax) /* regs->cs */
+ pushq 1*8(%rax) /* regs->ip */
+ pushq 0*8(%rax) /* regs->orig_ax */
.Lint80_keep_stack:
- pushq %rsi /* pt_regs->si */
- xorl %esi, %esi /* nospec si */
- pushq %rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
- pushq %rcx /* pt_regs->cx */
- xorl %ecx, %ecx /* nospec cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq %r8 /* pt_regs->r8 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq %r9 /* pt_regs->r9 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq %r10 /* pt_regs->r10*/
- xorl %r10d, %r10d /* nospec r10 */
- pushq %r11 /* pt_regs->r11 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp */
- pushq %r12 /* pt_regs->r12 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq %r13 /* pt_regs->r13 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq %r14 /* pt_regs->r14 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq %r15 /* pt_regs->r15 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
UNWIND_HINT_REGS
cld
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 86eb0d89d46f..8cfc9bc73e7f 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -5,21 +5,21 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
+#ifdef CONFIG_IA32_EMULATION
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
+#else
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#endif
+
+#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
+#undef __SYSCALL
-#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym,
+#define __SYSCALL(nr, sym) __ia32_##sym,
-__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall,
+__visible const sys_call_ptr_t ia32_sys_call_table[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 1594ec72bcbb..be120eec1fc9 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -5,23 +5,14 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __SYSCALL_X32(nr, sym)
-#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym)
-
-#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
-#undef __SYSCALL_64
+#undef __SYSCALL
-#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym,
+#define __SYSCALL(nr, sym) __x64_##sym,
-asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall,
+asmlinkage const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index f2fe0a33bcfd..bdd0e03a1265 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -5,37 +5,14 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-/*
- * Reuse the 64-bit entry points for the x32 versions that occupy different
- * slots in the syscall table.
- */
-#define __x32_sys_readv __x64_sys_readv
-#define __x32_sys_writev __x64_sys_writev
-#define __x32_sys_getsockopt __x64_sys_getsockopt
-#define __x32_sys_setsockopt __x64_sys_setsockopt
-#define __x32_sys_vmsplice __x64_sys_vmsplice
-#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv
-#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#include <asm/syscalls_x32.h>
+#undef __SYSCALL
-#define __SYSCALL_64(nr, sym)
+#define __SYSCALL(nr, sym) __x64_##sym,
-#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
-#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_64.h>
-#undef __SYSCALL_X32
-#undef __SYSCALL_COMMON
-
-#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym,
-#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym,
-
-asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall,
-#include <asm/syscalls_64.h>
+asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
};
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index d8c4f6c9eadc..eca5d6eff132 100644
--- a/arch/x86/entry/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -3,53 +3,59 @@ out := arch/$(SRCARCH)/include/generated/asm
uapi := arch/$(SRCARCH)/include/generated/uapi/asm
# Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
- $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')
+$(shell mkdir -p $(out) $(uapi))
syscall32 := $(src)/syscall_32.tbl
syscall64 := $(src)/syscall_64.tbl
-syshdr := $(srctree)/$(src)/syscallhdr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
+offset :=
+prefix :=
quiet_cmd_syshdr = SYSHDR $@
- cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
- '$(syshdr_abi_$(basetarget))' \
- '$(syshdr_pfx_$(basetarget))' \
- '$(syshdr_offset_$(basetarget))'
+ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \
+ $(if $(offset),--offset $(offset)) \
+ $(if $(prefix),--prefix $(prefix)) \
+ $< $@
quiet_cmd_systbl = SYSTBL $@
- cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
quiet_cmd_hypercalls = HYPERCALLS $@
cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs))
-syshdr_abi_unistd_32 := i386
+$(uapi)/unistd_32.h: abis := i386
$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_32_ia32 := i386
-syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: abis := i386
+$(out)/unistd_32_ia32.h: prefix := ia32_
$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_x32 := common,x32
-syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
+$(uapi)/unistd_x32.h: abis := common,x32
+$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT
$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_64 := common,64
+$(uapi)/unistd_64.h: abis := common,64
$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_64_x32 := x32
-syshdr_pfx_unistd_64_x32 := x32_
+$(out)/unistd_64_x32.h: abis := x32
+$(out)/unistd_64_x32.h: prefix := x32_
$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
+$(out)/syscalls_32.h: abis := i386
$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE
$(call if_changed,systbl)
+$(out)/syscalls_64.h: abis := common,64
$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE
$(call if_changed,systbl)
+$(out)/syscalls_x32.h: abis := common,x32
+$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE
$(call if_changed,hypercalls)
@@ -60,6 +66,7 @@ uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
syshdr-y += syscalls_32.h
syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_X86_X32_ABI) += syscalls_x32.h
syshdr-$(CONFIG_XEN) += xen-hypercalls.h
uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y))
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index a1c9f496fca6..320480a8db4f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -145,7 +145,7 @@
131 i386 quotactl sys_quotactl
132 i386 getpgid sys_getpgid
133 i386 fchdir sys_fchdir
-134 i386 bdflush sys_bdflush
+134 i386 bdflush sys_ni_syscall
135 i386 sysfs sys_sysfs
136 i386 personality sys_personality
137 i386 afs_syscall
@@ -286,7 +286,7 @@
272 i386 fadvise64_64 sys_ia32_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind
-275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
+275 i386 get_mempolicy sys_get_mempolicy
276 i386 set_mempolicy sys_set_mempolicy
277 i386 mq_open sys_mq_open compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink
@@ -328,7 +328,7 @@
314 i386 sync_file_range sys_ia32_sync_file_range
315 i386 tee sys_tee
316 i386 vmsplice sys_vmsplice
-317 i386 move_pages sys_move_pages compat_sys_move_pages
+317 i386 move_pages sys_move_pages
318 i386 getcpu sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait
320 i386 utimensat sys_utimensat_time32
@@ -447,3 +447,11 @@
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 i386 mount_setattr sys_mount_setattr
+443 i386 quotactl_fd sys_quotactl_fd
+444 i386 landlock_create_ruleset sys_landlock_create_ruleset
+445 i386 landlock_add_rule sys_landlock_add_rule
+446 i386 landlock_restrict_self sys_landlock_restrict_self
+447 i386 memfd_secret sys_memfd_secret
+448 i386 process_mrelease sys_process_mrelease
+449 i386 futex_waitv sys_futex_waitv
+450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7bf01cbe582f..c84d12608cd2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,6 +364,14 @@
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node
#
# Due to a historical design error, certain syscalls are numbered differently
@@ -392,7 +400,7 @@
530 x32 set_robust_list compat_sys_set_robust_list
531 x32 get_robust_list compat_sys_get_robust_list
532 x32 vmsplice sys_vmsplice
-533 x32 move_pages compat_sys_move_pages
+533 x32 move_pages sys_move_pages
534 x32 preadv compat_sys_preadv64
535 x32 pwritev compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh
deleted file mode 100644
index cc1e63857427..000000000000
--- a/arch/x86/entry/syscalls/syscallhdr.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_ASM_X86_`basename "$out" | sed \
- -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
- -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
- echo "#ifndef ${fileguard}"
- echo "#define ${fileguard} 1"
- echo ""
-
- max=0
- while read nr abi name entry ; do
- if [ -z "$offset" ]; then
- echo "#define __NR_${prefix}${name} $nr"
- else
- echo "#define __NR_${prefix}${name} ($offset + $nr)"
- fi
-
- max=$nr
- done
-
- echo ""
- echo "#ifdef __KERNEL__"
- echo "#define __NR_${prefix}syscall_max $max"
- echo "#endif"
- echo ""
- echo "#endif /* ${fileguard} */"
-) > "$out"
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
deleted file mode 100644
index 929bde120d6b..000000000000
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-
-syscall_macro() {
- local abi="$1"
- local nr="$2"
- local entry="$3"
-
- echo "__SYSCALL_${abi}($nr, $entry)"
-}
-
-emit() {
- local abi="$1"
- local nr="$2"
- local entry="$3"
- local compat="$4"
-
- if [ "$abi" != "I386" -a -n "$compat" ]; then
- echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2
- exit 1
- fi
-
- if [ -z "$compat" ]; then
- if [ -n "$entry" ]; then
- syscall_macro "$abi" "$nr" "$entry"
- fi
- else
- echo "#ifdef CONFIG_X86_32"
- if [ -n "$entry" ]; then
- syscall_macro "$abi" "$nr" "$entry"
- fi
- echo "#else"
- syscall_macro "$abi" "$nr" "$compat"
- echo "#endif"
- fi
-}
-
-grep '^[0-9]' "$in" | sort -n | (
- while read nr abi name entry compat; do
- abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
- emit "$abi" "$nr" "$entry" "$compat"
- done
-) > "$out"
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index f1f96d4d8cd6..7591bab060f7 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -24,7 +24,7 @@ SYM_CODE_START_NOALIGN(\name)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
_ASM_NOKPROBE(\name)
SYM_CODE_END(\name)
.endm
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 496b11ec469d..505b488fcc65 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -50,7 +50,7 @@ SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
popq %rsi
popq %rdi
popq %rbp
- ret
+ RET
_ASM_NOKPROBE(__thunk_restore)
SYM_CODE_END(__thunk_restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 05c4abc2fdfd..c2a8b76ae0bc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -131,7 +131,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE
targets += vdsox32.lds $(vobjx32s-y)
$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
-$(obj)/%.so: $(obj)/%.so.dbg
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
@@ -148,6 +148,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
@@ -172,7 +173,7 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
- cmd_vdso = $(LD) -nostdlib -o $@ \
+ cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index dc8da7695859..bafa73f09e92 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -77,7 +77,6 @@ SECTIONS
.text : {
*(.text*)
- *(.fixup)
} :text =0x90909090,
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 2d0f3d8bcc25..edfe9780f6d1 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -218,7 +218,7 @@ int main(int argc, char **argv)
/*
* Figure out the struct name. If we're writing to a .so file,
- * generate raw output insted.
+ * generate raw output instead.
*/
name = strdup(argv[3]);
namelen = strlen(name);
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 1c7cfac7e64a..5264daa8859f 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -35,7 +35,7 @@ static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
if (offset + len > data_len)
fail("section to extract overruns input data");
- fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len);
+ fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len);
BITSFUNC(copy)(outfile, data + offset, len);
fprintf(outfile, "\n};\n\n");
}
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index de1fff7188aa..d33c6513fd2c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -6,7 +6,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.text
.globl __kernel_vsyscall
@@ -29,7 +29,7 @@ __kernel_vsyscall:
* anyone with an AMD CPU, for example). Nonetheless, we try to keep
* it working approximately as well as it ever worked.
*
- * This link may eludicate some of the history:
+ * This link may elucidate some of the history:
* https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
* personally, I find it hard to understand what's going on there.
*
@@ -78,7 +78,7 @@ SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
popl %ecx
CFI_RESTORE ecx
CFI_ADJUST_CFA_OFFSET -4
- ret
+ RET
CFI_ENDPROC
.size __kernel_vsyscall,.-__kernel_vsyscall
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 825e829ffff1..1000d457c332 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -358,7 +358,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
mmap_write_lock(mm);
/*
* Check if we have already mapped vdso blob - fail to prevent
- * abusing from userspace install_speciall_mapping, which may
+ * abusing from userspace install_special_mapping, which may
* not do accounting and rlimit right.
* We could search vma near context.vdso, but it's a slowpath,
* so let's explicitly check all VMAs to be completely sure.
@@ -438,7 +438,7 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
static __init int vdso_setup(char *s)
{
vdso64_enabled = simple_strtoul(s, NULL, 0);
- return 0;
+ return 1;
}
__setup("vdso=", vdso_setup);
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 86a0e94f68df..d77d278ee9dd 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -81,7 +81,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
pop %rbx
leave
.cfi_def_cfa %rsp, 8
- ret
+ RET
/* The out-of-line code runs with the pre-leave stack frame. */
.cfi_def_cfa %rbp, 16
@@ -137,7 +137,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
/*
* If the return from callback is zero or negative, return immediately,
- * else re-execute ENCLU with the postive return value interpreted as
+ * else re-execute ENCLU with the positive return value interpreted as
* the requested ENCLU function.
*/
cmp $0, %eax
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1b40b9297083..4af81df133ee 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -48,7 +48,7 @@ static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
XONLY;
#else
- EMULATE;
+ #error VSYSCALL config is broken
#endif
static int __init vsyscall_setup(char *str)
@@ -226,7 +226,8 @@ bool emulate_vsyscall(unsigned long error_code,
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
- do_exit(SIGSYS);
+ force_exit_sig(SIGSYS);
+ return true;
}
regs->orig_ax = -1;
if (tmp)
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index 2e203f3a25a7..15e35159ebb6 100644
--- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -19,17 +19,17 @@ __vsyscall_page:
mov $__NR_gettimeofday, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_time, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_getcpu, %rax
syscall
- ret
+ RET
.balign 4096, 0xcc
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig
index 39d9ded9e25a..dabdf3d7bf84 100644
--- a/arch/x86/events/Kconfig
+++ b/arch/x86/events/Kconfig
@@ -6,24 +6,24 @@ config PERF_EVENTS_INTEL_UNCORE
depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
default y
help
- Include support for Intel uncore performance events. These are
- available on NehalemEX and more modern processors.
+ Include support for Intel uncore performance events. These are
+ available on NehalemEX and more modern processors.
config PERF_EVENTS_INTEL_RAPL
tristate "Intel/AMD rapl performance events"
depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI
default y
help
- Include support for Intel and AMD rapl performance events for power
- monitoring on modern processors.
+ Include support for Intel and AMD rapl performance events for power
+ monitoring on modern processors.
config PERF_EVENTS_INTEL_CSTATE
tristate "Intel cstate performance events"
depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
default y
help
- Include support for Intel cstate performance events for power
- monitoring on modern processors.
+ Include support for Intel cstate performance events for power
+ monitoring on modern processors.
config PERF_EVENTS_AMD_POWER
depends on PERF_EVENTS && CPU_SUP_AMD
@@ -34,4 +34,22 @@ config PERF_EVENTS_AMD_POWER
(CPUID Fn8000_0007_EDX[12]) interface to calculate the
average power consumption on Family 15h processors.
+config PERF_EVENTS_AMD_UNCORE
+ tristate "AMD Uncore performance events"
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ default y
+ help
+ Include support for AMD uncore performance events for use with
+ e.g., perf stat -e amd_l3/.../,amd_df/.../.
+
+ To compile this driver as a module, choose M here: the
+ module will be called 'amd-uncore'.
+
+config PERF_EVENTS_AMD_BRS
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ bool "AMD Zen3 Branch Sampling support"
+ help
+ Enable AMD Zen3 branch sampling support (BRS) which samples up to
+ 16 consecutive taken branches in registers.
+
endmenu
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
index fe8795a67385..b9f5d4610256 100644
--- a/arch/x86/events/amd/Makefile
+++ b/arch/x86/events/amd/Makefile
@@ -1,8 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o
+obj-$(CONFIG_CPU_SUP_AMD) += core.o
+obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o
obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o
obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
+obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o
+amd-uncore-objs := uncore.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += iommu.o
endif
-
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
new file mode 100644
index 000000000000..bee8765a1e9b
--- /dev/null
+++ b/arch/x86/events/amd/brs.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement support for AMD Fam19h Branch Sampling feature
+ * Based on specifications published in AMD PPR Fam19 Model 01
+ *
+ * Copyright 2021 Google LLC
+ * Contributed by Stephane Eranian <eranian@google.com>
+ */
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <asm/msr.h>
+#include <asm/cpufeature.h>
+
+#include "../perf_event.h"
+
+#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */
+
+/* Debug Extension Configuration register layout */
+union amd_debug_extn_cfg {
+ __u64 val;
+ struct {
+ __u64 rsvd0:2, /* reserved */
+ brsmen:1, /* branch sample enable */
+ rsvd4_3:2,/* reserved - must be 0x3 */
+ vb:1, /* valid branches recorded */
+ rsvd2:10, /* reserved */
+ msroff:4, /* index of next entry to write */
+ rsvd3:4, /* reserved */
+ pmc:3, /* #PMC holding the sampling event */
+ rsvd4:37; /* reserved */
+ };
+};
+
+static inline unsigned int brs_from(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx;
+}
+
+static inline unsigned int brs_to(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
+}
+
+static inline void set_debug_extn_cfg(u64 val)
+{
+ /* bits[4:3] must always be set to 11b */
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
+}
+
+static inline u64 get_debug_extn_cfg(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, val);
+ return val;
+}
+
+static bool __init amd_brs_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return false;
+
+ switch (boot_cpu_data.x86) {
+ case 0x19: /* AMD Fam19h (Zen3) */
+ x86_pmu.lbr_nr = 16;
+
+ /* No hardware filtering supported */
+ x86_pmu.lbr_sel_map = NULL;
+ x86_pmu.lbr_sel_mask = 0;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Current BRS implementation does not support branch type or privilege level
+ * filtering. Therefore, this function simply enforces these limitations. No need for
+ * a br_sel_map. Software filtering is not supported because it would not correlate well
+ * with a sampling period.
+ */
+int amd_brs_setup_filter(struct perf_event *event)
+{
+ u64 type = event->attr.branch_sample_type;
+
+ /* No BRS support */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /* Can only capture all branches, i.e., no filtering */
+ if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* tos = top of stack, i.e., last valid entry written */
+static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg)
+{
+ /*
+ * msroff: index of next entry to write so top-of-stack is one off
+ * if BRS is full then msroff is set back to 0.
+ */
+ return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1;
+}
+
+/*
+ * make sure we have a sane BRS offset to begin with
+ * especially with kexec
+ */
+void amd_brs_reset(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return;
+
+ /*
+ * Reset config
+ */
+ set_debug_extn_cfg(0);
+
+ /*
+ * Mark first entry as poisoned
+ */
+ wrmsrl(brs_to(0), BRS_POISON);
+}
+
+int __init amd_brs_init(void)
+{
+ if (!amd_brs_detect())
+ return -EOPNOTSUPP;
+
+ pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr);
+
+ return 0;
+}
+
+void amd_brs_enable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Activate only on first user */
+ if (++cpuc->brs_active > 1)
+ return;
+
+ cfg.val = 0; /* reset all fields */
+ cfg.brsmen = 1; /* enable branch sampling */
+
+ /* Set enable bit */
+ set_debug_extn_cfg(cfg.val);
+}
+
+void amd_brs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_enable();
+}
+
+void amd_brs_disable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Check if active (could be disabled via x86_pmu_disable_all()) */
+ if (!cpuc->brs_active)
+ return;
+
+ /* Only disable for last user */
+ if (--cpuc->brs_active)
+ return;
+
+ /*
+ * Clear the brsmen bit but preserve the others as they contain
+ * useful state such as vb and msroff
+ */
+ cfg.val = get_debug_extn_cfg();
+
+ /*
+ * When coming in on interrupt and BRS is full, then hw will have
+ * already stopped BRS, no need to issue wrmsr again
+ */
+ if (cfg.brsmen) {
+ cfg.brsmen = 0;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+void amd_brs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_disable();
+}
+
+static bool amd_brs_match_plm(struct perf_event *event, u64 to)
+{
+ int type = event->attr.branch_sample_type;
+ int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV;
+ int plm_u = PERF_SAMPLE_BRANCH_USER;
+
+ if (!(type & plm_k) && kernel_ip(to))
+ return 0;
+
+ if (!(type & plm_u) && !kernel_ip(to))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Caller must ensure amd_brs_inuse() is true before calling
+ * return:
+ */
+void amd_brs_drain(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event = cpuc->events[0];
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ union amd_debug_extn_cfg cfg;
+ u32 i, nr = 0, num, tos, start;
+ u32 shift = 64 - boot_cpu_data.x86_virt_bits;
+
+ /*
+ * BRS event forced on PMC0,
+ * so check if there is an event.
+ * It is possible to have lbr_users > 0 but the event
+ * not yet scheduled due to long latency PMU irq
+ */
+ if (!event)
+ goto empty;
+
+ cfg.val = get_debug_extn_cfg();
+
+ /* Sanity check [0-x86_pmu.lbr_nr] */
+ if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr))
+ goto empty;
+
+ /* No valid branch */
+ if (cfg.vb == 0)
+ goto empty;
+
+ /*
+ * msr.off points to next entry to be written
+ * tos = most recent entry index = msr.off - 1
+ * BRS register buffer saturates, so we know we have
+ * start < tos and that we have to read from start to tos
+ */
+ start = 0;
+ tos = amd_brs_get_tos(&cfg);
+
+ num = tos - start + 1;
+
+ /*
+ * BRS is only one pass (saturation) from MSROFF to depth-1
+ * MSROFF wraps to zero when buffer is full
+ */
+ for (i = 0; i < num; i++) {
+ u32 brs_idx = tos - i;
+ u64 from, to;
+
+ rdmsrl(brs_to(brs_idx), to);
+
+ /* Entry does not belong to us (as marked by kernel) */
+ if (to == BRS_POISON)
+ break;
+
+ /*
+ * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved.
+ * Necessary to generate proper virtual addresses suitable for
+ * symbolization
+ */
+ to = (u64)(((s64)to << shift) >> shift);
+
+ if (!amd_brs_match_plm(event, to))
+ continue;
+
+ rdmsrl(brs_from(brs_idx), from);
+
+ perf_clear_branch_entry_bitfields(br+nr);
+
+ br[nr].from = from;
+ br[nr].to = to;
+
+ nr++;
+ }
+empty:
+ /* Record number of sampled branches */
+ cpuc->lbr_stack.nr = nr;
+}
+
+/*
+ * Poison most recent entry to prevent reuse by next task
+ * required because BRS entry are not tagged by PID
+ */
+static void amd_brs_poison_buffer(void)
+{
+ union amd_debug_extn_cfg cfg;
+ unsigned int idx;
+
+ /* Get current state */
+ cfg.val = get_debug_extn_cfg();
+
+ /* idx is most recently written entry */
+ idx = amd_brs_get_tos(&cfg);
+
+ /* Poison target of entry */
+ wrmsrl(brs_to(idx), BRS_POISON);
+}
+
+/*
+ * On context switch in, we need to make sure no samples from previous user
+ * are left in the BRS.
+ *
+ * On ctxswin, sched_in = true, called after the PMU has started
+ * On ctxswout, sched_in = false, called before the PMU is stopped
+ */
+void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /* no active users */
+ if (!cpuc->lbr_users)
+ return;
+
+ /*
+ * On context switch in, we need to ensure we do not use entries
+ * from previous BRS user on that CPU, so we poison the buffer as
+ * a faster way compared to resetting all entries.
+ */
+ if (sched_in)
+ amd_brs_poison_buffer();
+}
+
+/*
+ * called from ACPI processor_idle.c or acpi_pad.c
+ * with interrupts disabled
+ */
+void perf_amd_brs_lopwr_cb(bool lopwr_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /*
+ * on mwait in, we may end up in non C0 state.
+ * we must disable branch sampling to avoid holding the NMI
+ * for too long. We disable it in hardware but we
+ * keep the state in cpuc, so we can re-enable.
+ *
+ * The hardware will deliver the NMI if needed when brsmen cleared
+ */
+ if (cpuc->brs_active) {
+ cfg.val = get_debug_extn_cfg();
+ cfg.brsmen = !lopwr_in;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb);
+
+void __init amd_brs_lopwr_init(void)
+{
+ static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 2c1791c4a518..9ac3718410ce 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/perf_event.h>
+#include <linux/jump_label.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -7,6 +8,7 @@
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <asm/apicdef.h>
+#include <asm/apic.h>
#include <asm/nmi.h>
#include "../perf_event.h"
@@ -18,6 +20,9 @@ static unsigned long perf_nmi_window;
#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
+/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */
+static u64 amd_pmu_global_cntr_mask __read_mostly;
+
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -325,8 +330,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
}
}
+#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
+static inline int amd_is_brs_event(struct perf_event *e)
+{
+ return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
+}
+
static int amd_core_hw_config(struct perf_event *event)
{
+ int ret = 0;
+
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -343,7 +356,66 @@ static int amd_core_hw_config(struct perf_event *event)
if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
event->hw.flags |= PERF_X86_EVENT_PAIR;
- return 0;
+ /*
+ * if branch stack is requested
+ */
+ if (has_branch_stack(event)) {
+ /*
+ * Due to interrupt holding, BRS is not recommended in
+ * counting mode.
+ */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ /*
+ * Due to the way BRS operates by holding the interrupt until
+ * lbr_nr entries have been captured, it does not make sense
+ * to allow sampling on BRS with an event that does not match
+ * what BRS is capturing, i.e., retired taken branches.
+ * Otherwise the correlation with the event's period is even
+ * more loose:
+ *
+ * With retired taken branch:
+ * Effective P = P + 16 + X
+ * With any other event:
+ * Effective P = P + Y + X
+ *
+ * Where X is the number of taken branches due to interrupt
+ * skid. Skid is large.
+ *
+ * Where Y is the occurences of the event while BRS is
+ * capturing the lbr_nr entries.
+ *
+ * By using retired taken branches, we limit the impact on the
+ * Y variable. We know it cannot be more than the depth of
+ * BRS.
+ */
+ if (!amd_is_brs_event(event))
+ return -EINVAL;
+
+ /*
+ * BRS implementation does not work with frequency mode
+ * reprogramming of the period.
+ */
+ if (event->attr.freq)
+ return -EINVAL;
+ /*
+ * The kernel subtracts BRS depth from period, so it must
+ * be big enough.
+ */
+ if (event->attr.sample_period <= x86_pmu.lbr_nr)
+ return -EINVAL;
+
+ /*
+ * Check if we can allow PERF_SAMPLE_BRANCH_STACK
+ */
+ ret = amd_brs_setup_filter(event);
+
+ /* only set in case of success */
+ if (!ret)
+ event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
+ }
+ return ret;
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -366,7 +438,7 @@ static int amd_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
- if (has_branch_stack(event))
+ if (has_branch_stack(event) && !x86_pmu.lbr_nr)
return -EOPNOTSUPP;
ret = x86_pmu_hw_config(event);
@@ -510,6 +582,18 @@ static struct amd_nb *amd_alloc_nb(int cpu)
return nb;
}
+static void amd_pmu_cpu_reset(int cpu)
+{
+ if (x86_pmu.version < 2)
+ return;
+
+ /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+
+ /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask);
+}
+
static int amd_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -555,6 +639,9 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
+
+ amd_brs_reset();
+ amd_pmu_cpu_reset(cpu);
}
static void amd_pmu_cpu_dead(int cpu)
@@ -574,8 +661,54 @@ static void amd_pmu_cpu_dead(int cpu)
cpuhw->amd_nb = NULL;
}
+
+ amd_pmu_cpu_reset(cpu);
+}
+
+static inline void amd_pmu_set_global_ctl(u64 ctl)
+{
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
+static inline u64 amd_pmu_get_global_status(void)
+{
+ u64 status;
+
+ /* PerfCntrGlobalStatus is read-only */
+ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status);
+
+ return status & amd_pmu_global_cntr_mask;
+}
+
+static inline void amd_pmu_ack_global_status(u64 status)
+{
+ /*
+ * PerfCntrGlobalStatus is read-only but an overflow acknowledgment
+ * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr
+ * clears the same bit in PerfCntrGlobalStatus
+ */
+
+ /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */
+ status &= amd_pmu_global_cntr_mask;
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status);
+}
+
+static bool amd_pmu_test_overflow_topbit(int idx)
+{
+ u64 counter;
+
+ rdmsrl(x86_pmu_event_addr(idx), counter);
+
+ return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1));
+}
+
+static bool amd_pmu_test_overflow_status(int idx)
+{
+ return amd_pmu_get_global_status() & BIT_ULL(idx);
+}
+
+DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit);
+
/*
* When a PMC counter overflows, an NMI is used to process the event and
* reset the counter. NMI latency can result in the counter being updated
@@ -588,7 +721,6 @@ static void amd_pmu_cpu_dead(int cpu)
static void amd_pmu_wait_on_overflow(int idx)
{
unsigned int i;
- u64 counter;
/*
* Wait for the counter to be reset if it has overflowed. This loop
@@ -596,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx)
* forever...
*/
for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
- rdmsrl(x86_pmu_event_addr(idx), counter);
- if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+ if (!static_call(amd_pmu_test_overflow)(idx))
break;
/* Might be in IRQ context, so can't sleep */
@@ -605,13 +736,11 @@ static void amd_pmu_wait_on_overflow(int idx)
}
}
-static void amd_pmu_disable_all(void)
+static void amd_pmu_check_overflow(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- x86_pmu_disable_all();
-
/*
* This shouldn't be called from NMI context, but add a safeguard here
* to return, since if we're in NMI context we can't wait for an NMI
@@ -623,7 +752,7 @@ static void amd_pmu_disable_all(void)
/*
* Check each counter for overflow and wait for it to be reset by the
* NMI if it has overflowed. This relies on the fact that all active
- * counters are always enabled when this function is caled and
+ * counters are always enabled when this function is called and
* ARCH_PERFMON_EVENTSEL_INT is always set.
*/
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -634,6 +763,47 @@ static void amd_pmu_disable_all(void)
}
}
+static void amd_pmu_enable_event(struct perf_event *event)
+{
+ x86_pmu_enable_event(event);
+}
+
+static void amd_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ amd_brs_enable_all();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ /* only activate events which are marked as active */
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ amd_pmu_enable_event(cpuc->events[idx]);
+ }
+}
+
+static void amd_pmu_v2_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * Testing cpu_hw_events.enabled should be skipped in this case unlike
+ * in x86_pmu_enable_event().
+ *
+ * Since cpu_hw_events.enabled is set only after returning from
+ * x86_pmu_start(), the PMCs must be programmed and kept ready.
+ * Counting starts only after x86_pmu_enable_all() is called.
+ */
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+static void amd_pmu_v2_enable_all(int added)
+{
+ amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask);
+}
+
static void amd_pmu_disable_event(struct perf_event *event)
{
x86_pmu_disable_event(event);
@@ -651,6 +821,32 @@ static void amd_pmu_disable_event(struct perf_event *event)
amd_pmu_wait_on_overflow(event->hw.idx);
}
+static void amd_pmu_disable_all(void)
+{
+ amd_brs_disable_all();
+ x86_pmu_disable_all();
+ amd_pmu_check_overflow();
+}
+
+static void amd_pmu_v2_disable_all(void)
+{
+ /* Disable all PMCs */
+ amd_pmu_set_global_ctl(0);
+ amd_pmu_check_overflow();
+}
+
+static void amd_pmu_add_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ amd_pmu_brs_add(event);
+}
+
+static void amd_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ amd_pmu_brs_del(event);
+}
+
/*
* Because of NMI latency, if multiple PMC counters are active or other sources
* of NMIs are received, the perf NMI handler can handle one or more overflowed
@@ -669,13 +865,8 @@ static void amd_pmu_disable_event(struct perf_event *event)
* handled a counter. When an un-handled NMI is received, it will be claimed
* only if arriving within that window.
*/
-static int amd_pmu_handle_irq(struct pt_regs *regs)
+static inline int amd_pmu_adjust_nmi_window(int handled)
{
- int handled;
-
- /* Process any counter overflows */
- handled = x86_pmu_handle_irq(regs);
-
/*
* If a counter was handled, record a timestamp such that un-handled
* NMIs will be claimed if arriving within that window.
@@ -692,6 +883,113 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return NMI_HANDLED;
}
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int handled;
+ int pmu_enabled;
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ /* stop everything (includes BRS) */
+ amd_pmu_disable_all();
+
+ /* Drain BRS is in use (could be inactive) */
+ if (cpuc->lbr_users)
+ amd_brs_drain();
+
+ /* Process any counter overflows */
+ handled = x86_pmu_handle_irq(regs);
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ amd_pmu_enable_all(0);
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
+static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_sample_data data;
+ struct hw_perf_event *hwc;
+ struct perf_event *event;
+ int handled = 0, idx;
+ u64 status, mask;
+ bool pmu_enabled;
+
+ /*
+ * Save the PMU state as it needs to be restored when leaving the
+ * handler
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ /* Stop counting */
+ amd_pmu_v2_disable_all();
+
+ status = amd_pmu_get_global_status();
+
+ /* Check if any overflows are pending */
+ if (!status)
+ goto done;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+ x86_perf_event_update(event);
+ mask = BIT_ULL(idx);
+
+ if (!(status & mask))
+ continue;
+
+ /* Event overflow */
+ handled++;
+ perf_sample_data_init(&data, 0, hwc->last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+
+ status &= ~mask;
+ }
+
+ /*
+ * It should never be the case that some overflows are not handled as
+ * the corresponding PMCs are expected to be inactive according to the
+ * active_mask
+ */
+ WARN_ON(status > 0);
+
+ /* Clear overflow bits */
+ amd_pmu_ack_global_status(~status);
+
+ /*
+ * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT
+ * PMI entry is not set by the local APIC when a PMC overflow occurs
+ */
+ inc_irq_stat(apic_perf_irqs);
+
+done:
+ cpuc->enabled = pmu_enabled;
+
+ /* Resume counting only if PMU is active */
+ if (pmu_enabled)
+ amd_pmu_v2_enable_all(0);
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -897,6 +1195,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
--cpuc->n_pair;
}
+/*
+ * Because of the way BRS operates with an inactive and active phases, and
+ * the link to one counter, it is not possible to have two events using BRS
+ * scheduled at the same time. There would be an issue with enforcing the
+ * period of each one and given that the BRS saturates, it would not be possible
+ * to guarantee correlated content for all events. Therefore, in situations
+ * where multiple events want to use BRS, the kernel enforces mutual exclusion.
+ * Exclusion is enforced by chosing only one counter for events using BRS.
+ * The event scheduling logic will then automatically multiplex the
+ * events and ensure that at most one event is actively using BRS.
+ *
+ * The BRS counter could be any counter, but there is no constraint on Fam19h,
+ * therefore all counters are equal and thus we pick the first one: PMC0
+ */
+static struct event_constraint amd_fam19h_brs_cntr0_constraint =
+ EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK);
+
+static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint =
+ __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR);
+
+static struct event_constraint *
+amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ bool has_brs = has_amd_brs(hwc);
+
+ /*
+ * In case BRS is used with an event requiring a counter pair,
+ * the kernel allows it but only on counter 0 & 1 to enforce
+ * multiplexing requiring to protect BRS in case of multiple
+ * BRS users
+ */
+ if (amd_is_pair_event_code(hwc)) {
+ return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint
+ : &pair_constraint;
+ }
+
+ if (has_brs)
+ return &amd_fam19h_brs_cntr0_constraint;
+
+ return &unconstrained;
+}
+
+
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
@@ -905,12 +1248,31 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
+static void amd_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (sched_in && x86_pmu.lbr_nr)
+ amd_pmu_brs_sched_task(ctx, sched_in);
+}
+
+static u64 amd_pmu_limit_period(struct perf_event *event, u64 left)
+{
+ /*
+ * Decrease period by the depth of the BRS feature to get the last N
+ * taken branches and approximate the desired period
+ */
+ if (has_branch_stack(event) && left > x86_pmu.lbr_nr)
+ left -= x86_pmu.lbr_nr;
+
+ return left;
+}
+
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = amd_pmu_handle_irq,
.disable_all = amd_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_event,
.disable = amd_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
@@ -920,6 +1282,8 @@ static __initconst const struct x86_pmu amd_pmu = {
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters = AMD64_NUM_COUNTERS,
+ .add = amd_pmu_add_event,
+ .del = amd_pmu_del_event,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
@@ -938,8 +1302,55 @@ static __initconst const struct x86_pmu amd_pmu = {
.amd_nb_constraints = 1,
};
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *amd_pmu_brs_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL,
+};
+
+static umode_t
+amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+}
+
+static struct attribute_group group_caps_amd_brs = {
+ .name = "caps",
+ .attrs = amd_pmu_brs_attrs,
+ .is_visible = amd_brs_is_visible,
+};
+
+EVENT_ATTR_STR(branch-brs, amd_branch_brs,
+ "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n");
+
+static struct attribute *amd_brs_events_attrs[] = {
+ EVENT_PTR(amd_branch_brs),
+ NULL,
+};
+
+static struct attribute_group group_events_amd_brs = {
+ .name = "events",
+ .attrs = amd_brs_events_attrs,
+ .is_visible = amd_brs_is_visible,
+};
+
+static const struct attribute_group *amd_attr_update[] = {
+ &group_caps_amd_brs,
+ &group_events_amd_brs,
+ NULL,
+};
+
static int __init amd_core_pmu_init(void)
{
+ union cpuid_0x80000022_ebx ebx;
u64 even_ctr_mask = 0ULL;
int i;
@@ -957,6 +1368,27 @@ static int __init amd_core_pmu_init(void)
x86_pmu.eventsel = MSR_F15H_PERF_CTL;
x86_pmu.perfctr = MSR_F15H_PERF_CTR;
x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+
+ /* Check for Performance Monitoring v2 support */
+ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+
+ /* Update PMU version for later usage */
+ x86_pmu.version = 2;
+
+ /* Find the number of available Core PMCs */
+ x86_pmu.num_counters = ebx.split.num_core_pmc;
+
+ amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1;
+
+ /* Update PMC handling functions */
+ x86_pmu.enable_all = amd_pmu_v2_enable_all;
+ x86_pmu.disable_all = amd_pmu_v2_disable_all;
+ x86_pmu.enable = amd_pmu_v2_enable_event;
+ x86_pmu.handle_irq = amd_pmu_v2_handle_irq;
+ static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status);
+ }
+
/*
* AMD Core perfctr has separate MSRs for the NB events, see
* the amd/uncore.c driver.
@@ -989,6 +1421,23 @@ static int __init amd_core_pmu_init(void)
x86_pmu.flags |= PMU_FL_PAIR;
}
+ /*
+ * BRS requires special event constraints and flushing on ctxsw.
+ */
+ if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) {
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h;
+ x86_pmu.sched_task = amd_pmu_sched_task;
+ x86_pmu.limit_period = amd_pmu_limit_period;
+ /*
+ * put_event_constraints callback same as Fam17h, set above
+ */
+
+ /* branch sampling must be stopped when entering low power */
+ amd_brs_lopwr_init();
+ }
+
+ x86_pmu.attr_update = amd_attr_update;
+
pr_cont("core perfctr, ");
return 0;
}
@@ -1023,6 +1472,24 @@ __init int amd_pmu_init(void)
return 0;
}
+static inline void amd_pmu_reload_virt(void)
+{
+ if (x86_pmu.version >= 2) {
+ /*
+ * Clear global enable bits, reprogram the PERF_CTL
+ * registers with updated perf_ctr_virt_mask and then
+ * set global enable bits once again
+ */
+ amd_pmu_v2_disable_all();
+ amd_pmu_enable_all(0);
+ amd_pmu_v2_enable_all(0);
+ return;
+ }
+
+ amd_pmu_disable_all();
+ amd_pmu_enable_all(0);
+}
+
void amd_pmu_enable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1030,8 +1497,7 @@ void amd_pmu_enable_virt(void)
cpuc->perf_ctr_virt_mask = 0;
/* Reload all events */
- amd_pmu_disable_all();
- x86_pmu_enable_all(0);
+ amd_pmu_reload_virt();
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
@@ -1048,7 +1514,6 @@ void amd_pmu_disable_virt(void)
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
- amd_pmu_disable_all();
- x86_pmu_enable_all(0);
+ amd_pmu_reload_virt();
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 40669eac9d6d..c251bc44c088 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -26,6 +26,7 @@ static u32 ibs_caps;
#include <linux/hardirq.h>
#include <asm/nmi.h>
+#include <asm/amd-ibs.h>
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
@@ -90,24 +91,12 @@ struct perf_ibs {
unsigned long offset_mask[1];
int offset_max;
unsigned int fetch_count_reset_broken : 1;
+ unsigned int fetch_ignore_if_zero_rip : 1;
struct cpu_perf_ibs __percpu *pcpu;
- struct attribute **format_attrs;
- struct attribute_group format_group;
- const struct attribute_group *attr_groups[2];
-
u64 (*get_count)(u64 config);
};
-struct perf_ibs_data {
- u32 size;
- union {
- u32 data[0]; /* data buffer starts here */
- u32 caps;
- };
- u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
-};
-
static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
@@ -311,6 +300,16 @@ static int perf_ibs_init(struct perf_event *event)
hwc->config_base = perf_ibs->msr;
hwc->config = config;
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack. Setting _EARLY flag
+ * makes sure we unwind call-stack before perf sample rip is set to
+ * IbsOpRip.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
+
return 0;
}
@@ -328,11 +327,14 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
static u64 get_ibs_fetch_count(u64 config)
{
- return (config & IBS_FETCH_CNT) >> 12;
+ union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
+
+ return fetch_ctl.fetch_cnt << 4;
}
static u64 get_ibs_op_count(u64 config)
{
+ union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
u64 count = 0;
/*
@@ -340,12 +342,12 @@ static u64 get_ibs_op_count(u64 config)
* and the lower 7 bits of CurCnt are randomized.
* Otherwise CurCnt has the full 27-bit current counter value.
*/
- if (config & IBS_OP_VAL) {
- count = (config & IBS_OP_MAX_CNT) << 4;
+ if (op_ctl.op_val) {
+ count = op_ctl.opmaxcnt << 4;
if (ibs_caps & IBS_CAPS_OPCNTEXT)
- count += config & IBS_OP_MAX_CNT_EXT_MASK;
+ count += op_ctl.opmaxcnt_ext << 20;
} else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
- count = (config & IBS_OP_CUR_CNT) >> 32;
+ count = op_ctl.opcurcnt;
}
return count;
@@ -522,16 +524,118 @@ static void perf_ibs_del(struct perf_event *event, int flags)
static void perf_ibs_read(struct perf_event *event) { }
+/*
+ * We need to initialize with empty group if all attributes in the
+ * group are dynamic.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group empty_format_group = {
+ .name = "format",
+ .attrs = attrs_empty,
+};
+
+static struct attribute_group empty_caps_group = {
+ .name = "caps",
+ .attrs = attrs_empty,
+};
+
+static const struct attribute_group *empty_attr_groups[] = {
+ &empty_format_group,
+ &empty_caps_group,
+ NULL,
+};
+
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
+PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
+PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
-static struct attribute *ibs_fetch_format_attrs[] = {
+static umode_t
+zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
+}
+
+static struct attribute *rand_en_attrs[] = {
&format_attr_rand_en.attr,
NULL,
};
-static struct attribute *ibs_op_format_attrs[] = {
- NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+static struct attribute *fetch_l3missonly_attrs[] = {
+ &fetch_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute *zen4_ibs_extensions_attrs[] = {
+ &zen4_ibs_extensions.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_rand_en = {
+ .name = "format",
+ .attrs = rand_en_attrs,
+};
+
+static struct attribute_group group_fetch_l3missonly = {
+ .name = "format",
+ .attrs = fetch_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static struct attribute_group group_zen4_ibs_extensions = {
+ .name = "caps",
+ .attrs = zen4_ibs_extensions_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static const struct attribute_group *fetch_attr_groups[] = {
+ &group_rand_en,
+ &empty_caps_group,
+ NULL,
+};
+
+static const struct attribute_group *fetch_attr_update[] = {
+ &group_fetch_l3missonly,
+ &group_zen4_ibs_extensions,
+ NULL,
+};
+
+static umode_t
+cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
+}
+
+static struct attribute *cnt_ctl_attrs[] = {
+ &format_attr_cnt_ctl.attr,
+ NULL,
+};
+
+static struct attribute *op_l3missonly_attrs[] = {
+ &op_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_cnt_ctl = {
+ .name = "format",
+ .attrs = cnt_ctl_attrs,
+ .is_visible = cnt_ctl_is_visible,
+};
+
+static struct attribute_group group_op_l3missonly = {
+ .name = "format",
+ .attrs = op_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static const struct attribute_group *op_attr_update[] = {
+ &group_cnt_ctl,
+ &group_op_l3missonly,
+ &group_zen4_ibs_extensions,
NULL,
};
@@ -555,7 +659,6 @@ static struct perf_ibs perf_ibs_fetch = {
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
- .format_attrs = ibs_fetch_format_attrs,
.get_count = get_ibs_fetch_count,
};
@@ -570,6 +673,7 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
@@ -580,7 +684,6 @@ static struct perf_ibs perf_ibs_op = {
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
- .format_attrs = ibs_op_format_attrs,
.get_count = get_ibs_op_count,
};
@@ -672,6 +775,10 @@ fail:
if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else {
+ /* Workaround for erratum #1197 */
+ if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
+ goto out;
+
set_linear_ip(&regs, ibs_data.regs[1]);
regs.flags |= PERF_EFLAGS_EXACT;
}
@@ -686,6 +793,14 @@ fail:
data.raw = &raw;
}
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ data.callchain = perf_callchain(event, iregs);
+
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle) {
@@ -738,17 +853,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
perf_ibs->pcpu = pcpu;
- /* register attributes */
- if (perf_ibs->format_attrs[0]) {
- memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
- perf_ibs->format_group.name = "format";
- perf_ibs->format_group.attrs = perf_ibs->format_attrs;
-
- memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
- perf_ibs->attr_groups[0] = &perf_ibs->format_group;
- perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
- }
-
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
@@ -758,10 +862,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
return ret;
}
-static __init void perf_event_ibs_init(void)
+static __init int perf_ibs_fetch_init(void)
{
- struct attribute **attr = ibs_op_format_attrs;
-
/*
* Some chips fail to reset the fetch count when it is written; instead
* they need a 0-1 transition of IbsFetchEn.
@@ -769,12 +871,22 @@ static __init void perf_event_ibs_init(void)
if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
perf_ibs_fetch.fetch_count_reset_broken = 1;
- perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
+ perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
- if (ibs_caps & IBS_CAPS_OPCNT) {
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
+
+ perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
+ perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
+
+ return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+}
+
+static __init int perf_ibs_op_init(void)
+{
+ if (ibs_caps & IBS_CAPS_OPCNT)
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
- *attr++ = &format_attr_cnt_ctl.attr;
- }
if (ibs_caps & IBS_CAPS_OPCNTEXT) {
perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
@@ -782,15 +894,52 @@ static __init void perf_event_ibs_init(void)
perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK;
}
- perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
+
+ perf_ibs_op.pmu.attr_groups = empty_attr_groups;
+ perf_ibs_op.pmu.attr_update = op_attr_update;
+
+ return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+}
+
+static __init int perf_event_ibs_init(void)
+{
+ int ret;
+
+ ret = perf_ibs_fetch_init();
+ if (ret)
+ return ret;
+
+ ret = perf_ibs_op_init();
+ if (ret)
+ goto err_op;
+
+ ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+ if (ret)
+ goto err_nmi;
- register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+ return 0;
+
+err_nmi:
+ perf_pmu_unregister(&perf_ibs_op.pmu);
+ free_percpu(perf_ibs_op.pcpu);
+ perf_ibs_op.pcpu = NULL;
+err_op:
+ perf_pmu_unregister(&perf_ibs_fetch.pmu);
+ free_percpu(perf_ibs_fetch.pcpu);
+ perf_ibs_fetch.pcpu = NULL;
+
+ return ret;
}
#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-static __init void perf_event_ibs_init(void) { }
+static __init int perf_event_ibs_init(void)
+{
+ return 0;
+}
#endif
@@ -1060,9 +1209,7 @@ static __init int amd_ibs_init(void)
x86_pmu_amd_ibs_starting_cpu,
x86_pmu_amd_ibs_dying_cpu);
- perf_event_ibs_init();
-
- return 0;
+ return perf_event_ibs_init();
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index be50ef8572cc..b15f7b950d2e 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -14,12 +14,11 @@
#include <linux/init.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
+#include <linux/amd-iommu.h>
#include "../perf_event.h"
#include "iommu.h"
-#define COUNTER_SHIFT 16
-
/* iommu pmu conf masks */
#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
@@ -81,12 +80,12 @@ static struct attribute_group amd_iommu_events_group = {
};
struct amd_iommu_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *event;
};
-static ssize_t _iommu_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t _iommu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct amd_iommu_event_desc *event =
container_of(attr, struct amd_iommu_event_desc, attr);
@@ -162,7 +161,7 @@ static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
raw_spin_lock_irqsave(&piommu->lock, flags);
- for (bank = 0, shift = 0; bank < max_banks; bank++) {
+ for (bank = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
@@ -285,22 +284,31 @@ static void perf_iommu_start(struct perf_event *event, int flags)
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
+ /*
+ * To account for power-gating, which prevents write to
+ * the counter, we need to enable the counter
+ * before setting up counter register.
+ */
+ perf_iommu_enable_event(event);
+
if (flags & PERF_EF_RELOAD) {
- u64 prev_raw_count = local64_read(&hwc->prev_count);
+ u64 count = 0;
struct amd_iommu *iommu = perf_event_2_iommu(event);
+ /*
+ * Since the IOMMU PMU only support counting mode,
+ * the counter always start with value zero.
+ */
amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
- IOMMU_PC_COUNTER_REG, &prev_raw_count);
+ IOMMU_PC_COUNTER_REG, &count);
}
- perf_iommu_enable_event(event);
perf_event_update_userpage(event);
-
}
static void perf_iommu_read(struct perf_event *event)
{
- u64 count, prev, delta;
+ u64 count;
struct hw_perf_event *hwc = &event->hw;
struct amd_iommu *iommu = perf_event_2_iommu(event);
@@ -311,14 +319,11 @@ static void perf_iommu_read(struct perf_event *event)
/* IOMMU pc counter register is only 48 bits */
count &= GENMASK_ULL(47, 0);
- prev = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
- return;
-
- /* Handle 48-bit counter overflow */
- delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
- delta >>= COUNTER_SHIFT;
- local64_add(delta, &event->count);
+ /*
+ * Since the counter always start with value zero,
+ * simply just accumulate the count for the event.
+ */
+ local64_add(count, &event->count);
}
static void perf_iommu_stop(struct perf_event *event, int flags)
@@ -328,15 +333,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
if (hwc->state & PERF_HES_UPTODATE)
return;
+ /*
+ * To account for power-gating, in which reading the counter would
+ * return zero, we need to read the register before disabling.
+ */
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+
perf_iommu_disable_event(event);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
-
- if (hwc->state & PERF_HES_UPTODATE)
- return;
-
- perf_iommu_read(event);
- hwc->state |= PERF_HES_UPTODATE;
}
static int perf_iommu_add(struct perf_event *event, int flags)
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 0e5c036fd7be..e6310c635c8b 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -17,27 +17,8 @@
#define IOMMU_PC_DEVID_MATCH_REG 0x20
#define IOMMU_PC_COUNTER_REPORT_REG 0x28
-/* maximun specified bank/counters */
+/* maximum specified bank/counters */
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
-struct amd_iommu;
-
-/* amd_iommu_init.c external support functions */
-extern int amd_iommu_get_num_iommus(void);
-
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
-
-extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
-
-extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
- u8 fxn, u64 *value);
-
-extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
- u8 fxn, u64 *value);
-
-extern struct amd_iommu *get_amd_iommu(int idx);
-
#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
index 16a2369c586e..37d5b380516e 100644
--- a/arch/x86/events/amd/power.c
+++ b/arch/x86/events/amd/power.c
@@ -213,6 +213,7 @@ static struct pmu pmu_class = {
.stop = pmu_event_stop,
.read = pmu_event_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .module = THIS_MODULE,
};
static int power_cpu_exit(unsigned int cpu)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 7f014d450bc2..0d04414b97d2 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -12,11 +12,11 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/cpufeature.h>
+#include <linux/smp.h>
-#include <asm/cpufeature.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
-#include <asm/smp.h>
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
@@ -275,14 +275,14 @@ static struct attribute_group amd_uncore_attr_group = {
};
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
@@ -347,6 +347,7 @@ static struct pmu amd_nb_pmu = {
.stop = amd_uncore_stop,
.read = amd_uncore_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
};
static struct pmu amd_llc_pmu = {
@@ -360,6 +361,7 @@ static struct pmu amd_llc_pmu = {
.stop = amd_uncore_stop,
.read = amd_uncore_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
@@ -452,7 +454,7 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
if (amd_uncore_llc) {
uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- uncore->id = per_cpu(cpu_llc_id, cpu);
+ uncore->id = get_llc_id(cpu);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
@@ -659,12 +661,34 @@ fail_prep:
fail_llc:
if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
perf_pmu_unregister(&amd_nb_pmu);
- if (amd_uncore_llc)
- free_percpu(amd_uncore_llc);
+ free_percpu(amd_uncore_llc);
fail_nb:
- if (amd_uncore_nb)
- free_percpu(amd_uncore_nb);
+ free_percpu(amd_uncore_nb);
return ret;
}
-device_initcall(amd_uncore_init);
+
+static void __exit amd_uncore_exit(void)
+{
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE);
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
+ cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
+ perf_pmu_unregister(&amd_llc_pmu);
+ free_percpu(amd_uncore_llc);
+ amd_uncore_llc = NULL;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+ perf_pmu_unregister(&amd_nb_pmu);
+ free_percpu(amd_uncore_nb);
+ amd_uncore_nb = NULL;
+ }
+}
+
+module_init(amd_uncore_init);
+module_exit(amd_uncore_exit);
+
+MODULE_DESCRIPTION("AMD Uncore Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 18df17129695..30788894124f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -45,13 +45,16 @@
#include "perf_event.h"
struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
+ .pmu = &pmu,
};
DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
/*
* This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
@@ -63,6 +66,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
+
DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
@@ -151,15 +156,16 @@ again:
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
+ struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
struct hw_perf_event_extra *reg;
struct extra_reg *er;
reg = &event->hw.extra_reg;
- if (!x86_pmu.extra_regs)
+ if (!extra_regs)
return 0;
- for (er = x86_pmu.extra_regs; er->msr; er++) {
+ for (er = extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
@@ -182,16 +188,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
+static inline int get_possible_num_counters(void)
+{
+ int i, num_counters = x86_pmu.num_counters;
+
+ if (!is_hybrid())
+ return num_counters;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+ num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);
+
+ return num_counters;
+}
+
static bool reserve_pmc_hardware(void)
{
- int i;
+ int i, num_counters = get_possible_num_counters();
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -202,7 +221,7 @@ eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
- i = x86_pmu.num_counters;
+ i = num_counters;
perfctr_fail:
for (i--; i >= 0; i--)
@@ -213,9 +232,9 @@ perfctr_fail:
static void release_pmc_hardware(void)
{
- int i;
+ int i, num_counters = get_possible_num_counters();
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -228,7 +247,7 @@ static void release_pmc_hardware(void) {}
#endif
-static bool check_hw_exists(void)
+bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
{
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -239,7 +258,7 @@ static bool check_hw_exists(void)
* Check to see if the BIOS enabled any of the counters, if so
* complain and bail.
*/
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
@@ -253,15 +272,15 @@ static bool check_hw_exists(void)
}
}
- if (x86_pmu.num_counters_fixed) {
+ if (num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
- for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
- if (fixed_counter_disabled(i))
+ for (i = 0; i < num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i, pmu))
continue;
- if (val & (0x03 << i*4)) {
+ if (val & (0x03ULL << i*4)) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
@@ -360,8 +379,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
- val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
+ val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
if (val == 0)
return -ENOENT;
@@ -369,7 +387,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
hwc->config |= val;
- attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+ attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
}
@@ -380,10 +398,12 @@ int x86_reserve_hardware(void)
if (!atomic_inc_not_zero(&pmc_refcount)) {
mutex_lock(&pmc_reserve_mutex);
if (atomic_read(&pmc_refcount) == 0) {
- if (!reserve_pmc_hardware())
+ if (!reserve_pmc_hardware()) {
err = -EBUSY;
- else
+ } else {
reserve_ds_buffers();
+ reserve_lbr_buffers();
+ }
}
if (!err)
atomic_inc(&pmc_refcount);
@@ -462,7 +482,7 @@ int x86_setup_perfctr(struct perf_event *event)
local64_set(&hwc->period_left, hwc->sample_period);
}
- if (attr->type == PERF_TYPE_RAW)
+ if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
if (attr->type == PERF_TYPE_HW_CACHE)
@@ -597,7 +617,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
- if (event->attr.type == PERF_TYPE_RAW)
+ if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
if (event->attr.sample_period && x86_pmu.limit_period) {
@@ -724,16 +744,33 @@ void x86_pmu_enable_all(int added)
}
}
-static struct pmu pmu;
-
static inline int is_x86_event(struct perf_event *event)
{
- return event->pmu == &pmu;
+ int i;
+
+ if (!is_hybrid())
+ return event->pmu == &pmu;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
+ return true;
+ }
+
+ return false;
}
-struct pmu *x86_get_pmu(void)
+struct pmu *x86_get_pmu(unsigned int cpu)
{
- return &pmu;
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ /*
+ * All CPUs of the hybrid type have been offline.
+ * The x86_get_pmu() should not be invoked.
+ */
+ if (WARN_ON_ONCE(!cpuc->pmu))
+ return &pmu;
+
+ return cpuc->pmu;
}
/*
* Event scheduler state:
@@ -765,7 +802,7 @@ struct perf_sched {
};
/*
- * Initialize interator that runs through all events and counters.
+ * Initialize iterator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
int num, int wmin, int wmax, int gpmax)
@@ -936,6 +973,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
+ int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1011,7 +1049,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n) {
- int gpmax = x86_pmu.num_counters;
+ int gpmax = num_counters;
/*
* Do not allow scheduling of more than half the available
@@ -1032,7 +1070,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* the extra Merge events needed by large increment events.
*/
if (x86_pmu.flags & PMU_FL_PAIR) {
- gpmax = x86_pmu.num_counters - cpuc->n_pair;
+ gpmax = num_counters - cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
@@ -1051,10 +1089,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* validate an event group (assign == NULL)
*/
if (!unsched && assign) {
- for (i = 0; i < n; i++) {
- e = cpuc->event_list[i];
+ for (i = 0; i < n; i++)
static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
- }
} else {
for (i = n0; i < n; i++) {
e = cpuc->event_list[i];
@@ -1096,8 +1132,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc,
static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
int max_count, int n)
{
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
- if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+ if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
return -EINVAL;
if (n >= max_count + cpuc->n_metric)
@@ -1118,10 +1155,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
*/
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
+ int num_counters = hybrid(cpuc->pmu, num_counters);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
- max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+ max_count = num_counters + num_counters_fixed;
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1178,6 +1217,8 @@ static inline void x86_assign_hw_event(struct perf_event *event,
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
+ static_call_cond(x86_pmu_assign)(event, idx);
+
switch (hwc->idx) {
case INTEL_PMC_IDX_FIXED_BTS:
case INTEL_PMC_IDX_FIXED_VLBR:
@@ -1297,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu)
if (hwc->state & PERF_HES_ARCH)
continue;
+ /*
+ * if cpuc->enabled = 0, then no wrmsr as
+ * per x86_pmu_enable_event()
+ */
x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
@@ -1480,7 +1525,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
- __set_bit(idx, cpuc->running);
static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event);
}
@@ -1489,18 +1533,19 @@ void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
u64 pebs, debugctl;
- struct cpu_hw_events *cpuc;
+ int cpu = smp_processor_id();
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int num_counters = hybrid(cpuc->pmu, num_counters);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
unsigned long flags;
- int cpu, idx;
+ int idx;
- if (!x86_pmu.num_counters)
+ if (!num_counters)
return;
local_irq_save(flags);
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_events, cpu);
-
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -1512,7 +1557,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- if (x86_pmu.pebs_constraints) {
+ if (pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
}
@@ -1523,7 +1568,7 @@ void perf_event_print_debug(void)
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < num_counters; idx++) {
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
@@ -1536,8 +1581,8 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- if (fixed_counter_disabled(idx))
+ for (idx = 0; idx < num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
@@ -1573,6 +1618,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
int i;
/*
@@ -1586,6 +1632,8 @@ static void x86_pmu_del(struct perf_event *event, int flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto do_del;
+ __set_bit(event->hw.idx, cpuc->dirty);
+
/*
* Not a TXN, therefore cleanup properly.
*/
@@ -1612,7 +1660,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
- if (x86_pmu.intel_cap.perf_metrics)
+ if (intel_cap.perf_metrics)
del_nr_metric_event(cpuc, event);
perf_event_update_userpage(event);
@@ -1660,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1793,7 +1845,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha
/* string trumps id */
if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
+ return sprintf(page, "%s\n", pmu_attr->event_str);
return x86_pmu.events_sysfs_show(page, config);
}
@@ -1822,6 +1874,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
pmu_attr->event_str_noht);
}
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+ struct x86_hybrid_pmu *pmu;
+ const char *str, *next_str;
+ int i;
+
+ if (hweight64(pmu_attr->pmu_type) == 1)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ /*
+ * Hybrid PMUs may support the same event name, but with different
+ * event encoding, e.g., the mem-loads event on an Atom PMU has
+ * different event encoding from a Core PMU.
+ *
+ * The event_str includes all event encodings. Each event encoding
+ * is divided by ";". The order of the event encodings must follow
+ * the order of the hybrid PMU index.
+ */
+ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ str = pmu_attr->event_str;
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+ continue;
+ if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+ next_str = strchr(str, ';');
+ if (next_str)
+ return snprintf(page, next_str - str + 1, "%s", str);
+ else
+ return sprintf(page, "%s", str);
+ }
+ str = strchr(str, ';');
+ str++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
EVENT_ATTR(cache-references, CACHE_REFERENCES );
@@ -1922,6 +2017,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_enable, x86_pmu.enable);
static_call_update(x86_pmu_disable, x86_pmu.disable);
+ static_call_update(x86_pmu_assign, x86_pmu.assign);
+
static_call_update(x86_pmu_add, x86_pmu.add);
static_call_update(x86_pmu_del, x86_pmu.del);
static_call_update(x86_pmu_read, x86_pmu.read);
@@ -1948,6 +2045,37 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl)
+{
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic registers: %d\n", num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & intel_ctrl));
+ pr_info("... event mask: %016Lx\n", intel_ctrl);
+}
+
+/*
+ * The generic code is not hybrid friendly. The hybrid_pmu->pmu
+ * of the first registered PMU is unconditionally assigned to
+ * each possible cpuctx->ctx.pmu.
+ * Update the correct hybrid PMU to the cpuctx->ctx.pmu.
+ */
+void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!pmu->pmu_cpu_context)
+ return;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->ctx.pmu = pmu;
+}
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1981,7 +2109,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
- if (!check_hw_exists())
+ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
return 0;
pr_cont("%s PMU driver.\n", x86_pmu.name);
@@ -2008,15 +2136,11 @@ static int __init init_hw_perf_events(void)
pmu.attr_update = x86_pmu.attr_update;
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %lu\n",
- hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
- << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
- pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+ if (!is_hybrid()) {
+ x86_pmu_show_pmu_cap(x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
+ }
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
@@ -2046,9 +2170,46 @@ static int __init init_hw_perf_events(void)
if (err)
goto out1;
- err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
- if (err)
- goto out2;
+ if (!is_hybrid()) {
+ err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+ if (err)
+ goto out2;
+ } else {
+ u8 cpu_type = get_this_hybrid_cpu_type();
+ struct x86_hybrid_pmu *hybrid_pmu;
+ int i, j;
+
+ if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ hybrid_pmu = &x86_pmu.hybrid_pmu[i];
+
+ hybrid_pmu->pmu = pmu;
+ hybrid_pmu->pmu.type = -1;
+ hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
+ hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+ hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
+
+ err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
+ (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
+ if (err)
+ break;
+
+ if (cpu_type == hybrid_pmu->cpu_type)
+ x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
+ }
+
+ if (i < x86_pmu.num_hybrid_pmus) {
+ for (j = 0; j < i; j++)
+ perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
+ pr_warn("Failed to register hybrid PMUs\n");
+ kfree(x86_pmu.hybrid_pmu);
+ x86_pmu.hybrid_pmu = NULL;
+ x86_pmu.num_hybrid_pmus = 0;
+ goto out2;
+ }
+ }
return 0;
@@ -2173,16 +2334,27 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc)
kfree(cpuc);
}
-static struct cpu_hw_events *allocate_fake_cpuc(void)
+static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
{
struct cpu_hw_events *cpuc;
- int cpu = raw_smp_processor_id();
+ int cpu;
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
if (!cpuc)
return ERR_PTR(-ENOMEM);
cpuc->is_fake = 1;
+ if (is_hybrid()) {
+ struct x86_hybrid_pmu *h_pmu;
+
+ h_pmu = hybrid_pmu(event_pmu);
+ if (cpumask_empty(&h_pmu->supported_cpus))
+ goto error;
+ cpu = cpumask_first(&h_pmu->supported_cpus);
+ } else
+ cpu = raw_smp_processor_id();
+ cpuc->pmu = event_pmu;
+
if (intel_cpuc_prepare(cpuc, cpu))
goto error;
@@ -2201,7 +2373,7 @@ static int validate_event(struct perf_event *event)
struct event_constraint *c;
int ret = 0;
- fake_cpuc = allocate_fake_cpuc();
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
@@ -2235,7 +2407,27 @@ static int validate_group(struct perf_event *event)
struct cpu_hw_events *fake_cpuc;
int ret = -EINVAL, n;
- fake_cpuc = allocate_fake_cpuc();
+ /*
+ * Reject events from different hybrid PMUs.
+ */
+ if (is_hybrid()) {
+ struct perf_event *sibling;
+ struct pmu *pmu = NULL;
+
+ if (is_x86_event(leader))
+ pmu = leader->pmu;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!is_x86_event(sibling))
+ continue;
+ if (!pmu)
+ pmu = sibling->pmu;
+ else if (pmu != sibling->pmu)
+ return ret;
+ }
+ }
+
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
/*
@@ -2263,51 +2455,70 @@ out:
static int x86_pmu_event_init(struct perf_event *event)
{
- struct pmu *tmp;
+ struct x86_hybrid_pmu *pmu = NULL;
int err;
- switch (event->attr.type) {
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- case PERF_TYPE_HW_CACHE:
- break;
-
- default:
+ if ((event->attr.type != event->pmu->type) &&
+ (event->attr.type != PERF_TYPE_HARDWARE) &&
+ (event->attr.type != PERF_TYPE_HW_CACHE))
return -ENOENT;
+
+ if (is_hybrid() && (event->cpu != -1)) {
+ pmu = hybrid_pmu(event->pmu);
+ if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
+ return -ENOENT;
}
err = __x86_pmu_event_init(event);
if (!err) {
- /*
- * we temporarily connect event to its pmu
- * such that validate_group() can classify
- * it as an x86 event using is_x86_event()
- */
- tmp = event->pmu;
- event->pmu = &pmu;
-
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
-
- event->pmu = tmp;
}
if (err) {
if (event->destroy)
event->destroy(event);
+ event->destroy = NULL;
}
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
- event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
return err;
}
+void perf_clear_dirty_counters(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ /* Don't need to clear the assigned counter. */
+ for (i = 0; i < cpuc->n_events; i++)
+ __clear_bit(cpuc->assign[i], cpuc->dirty);
+
+ if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+ return;
+
+ for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
+ if (i >= INTEL_PMC_IDX_FIXED) {
+ /* Metrics and fake events don't have corresponding HW counters. */
+ if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed))
+ continue;
+
+ wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
+ } else {
+ wrmsrl(x86_pmu_event_addr(i), 0);
+ }
+ }
+
+ bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
+}
+
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
/*
@@ -2328,8 +2539,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
-
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2340,7 +2550,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
return 0;
if (is_metric_idx(hwc->idx))
@@ -2475,6 +2685,14 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
+static int x86_pmu_filter_match(struct perf_event *event)
+{
+ if (x86_pmu.filter_match)
+ return x86_pmu.filter_match(event);
+
+ return 1;
+}
+
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
@@ -2502,6 +2720,8 @@ static struct pmu pmu = {
.check_period = x86_pmu_check_period,
.aux_output_match = x86_pmu_aux_output_match,
+
+ .filter_match = x86_pmu_filter_match,
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -2513,7 +2733,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
- !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
@@ -2559,7 +2779,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
struct unwind_state state;
unsigned long addr;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2582,7 +2802,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
- return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+ return __access_ok(fp, size);
}
static unsigned long get_segment_base(unsigned int segment)
@@ -2662,7 +2882,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
struct stack_frame frame;
const struct stack_frame __user *fp;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2739,18 +2959,19 @@ static unsigned long code_segment_base(struct pt_regs *regs)
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- return perf_guest_cbs->get_guest_ip();
+ if (perf_guest_state())
+ return perf_guest_get_ip();
return regs->ip + code_segment_base(regs);
}
unsigned long perf_misc_flags(struct pt_regs *regs)
{
+ unsigned int guest_state = perf_guest_state();
int misc = 0;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
+ if (guest_state) {
+ if (guest_state & PERF_GUEST_USER)
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
@@ -2770,6 +2991,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
cap->version = x86_pmu.version;
+ /*
+ * KVM doesn't support the hybrid PMU yet.
+ * Return the common value in global x86_pmu,
+ * which available for all cores.
+ */
cap->num_counters_gp = x86_pmu.num_counters;
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
cap->bit_width_gp = x86_pmu.cntval_bits;
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index e67a5886336c..10bde6c5abb2 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
-intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o
+intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o
obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
intel-cstate-objs := cstate.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 731dd8d0dbb1..974e917e65b2 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts)
} else {
local_set(&buf->data_size, head);
}
+
+ /*
+ * Since BTS is coherent, just add compiler barrier to ensure
+ * BTS updating is ordered against bts::handle::event.
+ */
+ barrier();
}
static int
@@ -594,7 +600,7 @@ static __init int bts_init(void)
* we cannot use the user mapping since it will not be available
* if we're not running the owning process.
*
- * With PTI we can't use the kernal map either, because its not
+ * With PTI we can't use the kernel map either, because its not
* there when we run userspace.
*
* For now, disable this driver when using PTI.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 37ce38403cb8..45024abd929f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -137,7 +137,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
@@ -181,6 +181,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_v5_gen_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ FIXED_EVENT_CONSTRAINT(0x0500, 4),
+ FIXED_EVENT_CONSTRAINT(0x0600, 5),
+ FIXED_EVENT_CONSTRAINT(0x0700, 6),
+ FIXED_EVENT_CONSTRAINT(0x0800, 7),
+ FIXED_EVENT_CONSTRAINT(0x0900, 8),
+ FIXED_EVENT_CONSTRAINT(0x0a00, 9),
+ FIXED_EVENT_CONSTRAINT(0x0b00, 10),
+ FIXED_EVENT_CONSTRAINT(0x0c00, 11),
+ FIXED_EVENT_CONSTRAINT(0x0d00, 12),
+ FIXED_EVENT_CONSTRAINT(0x0e00, 13),
+ FIXED_EVENT_CONSTRAINT(0x0f00, 14),
+ FIXED_EVENT_CONSTRAINT(0x1000, 15),
+ EVENT_CONSTRAINT_END
+};
+
static struct event_constraint intel_slm_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -243,7 +264,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
static struct event_constraint intel_icl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -254,7 +276,7 @@ static struct event_constraint intel_icl_event_constraints[] = {
INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */
- INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x56, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
@@ -263,6 +285,7 @@ static struct event_constraint intel_icl_event_constraints[] = {
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf),
+ INTEL_EVENT_CONSTRAINT(0xef, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf),
EVENT_CONSTRAINT_END
};
@@ -279,13 +302,15 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
- INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
EVENT_EXTRA_END
};
static struct event_constraint intel_spr_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -2076,6 +2101,14 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ EVENT_EXTRA_END
+};
+
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
@@ -2133,19 +2166,19 @@ static __initconst const u64 knl_hw_cache_extra_regs
* However, there are some cases which may change PEBS status, e.g. PMI
* throttle. The PEBS_ENABLE should be updated where the status changes.
*/
-static void __intel_pmu_disable_all(void)
+static __always_inline void __intel_pmu_disable_all(bool bts)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
}
-static void intel_pmu_disable_all(void)
+static __always_inline void intel_pmu_disable_all(void)
{
- __intel_pmu_disable_all();
+ __intel_pmu_disable_all(true);
intel_pmu_pebs_disable_all();
intel_pmu_lbr_disable_all();
}
@@ -2153,10 +2186,11 @@ static void intel_pmu_disable_all(void)
static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
- x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+ intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
@@ -2175,6 +2209,47 @@ static void intel_pmu_enable_all(int added)
__intel_pmu_enable_all(added, false);
}
+static noinline int
+__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries,
+ unsigned int cnt, unsigned long flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ intel_pmu_lbr_read();
+ cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+ intel_pmu_enable_all(0);
+ local_irq_restore(flags);
+ return cnt;
+}
+
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
+static int
+intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_arch_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
/*
* Workaround for:
* Intel Errata AAK100 (model 26)
@@ -2186,7 +2261,7 @@ static void intel_pmu_enable_all(int added)
* magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
* in sequence on the same PMC or on different PMCs.
*
- * In practise it appears some of these events do in fact count, and
+ * In practice it appears some of these events do in fact count, and
* we need to program all 4 events.
*/
static void intel_pmu_nhm_workaround(void)
@@ -2391,6 +2466,12 @@ static void intel_pmu_disable_event(struct perf_event *event)
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_assign_event(struct perf_event *event, int idx)
+{
+ if (is_pebs_pt(event))
+ perf_report_aux_output_id(event, idx);
+}
+
static void intel_pmu_del_event(struct perf_event *event)
{
if (needs_branch_stack(event))
@@ -2429,13 +2510,23 @@ static int icl_set_topdown_event_period(struct perf_event *event)
return 0;
}
+static int adl_set_topdown_event_period(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type != hybrid_big)
+ return 0;
+
+ return icl_set_topdown_event_period(event);
+}
+
static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
{
u32 val;
/*
* The metric is reported as an 8bit integer fraction
- * suming up to 0xff.
+ * summing up to 0xff.
* slots-in-metric = (Metric / 0xff) * slots
*/
val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff;
@@ -2569,6 +2660,17 @@ static u64 icl_update_topdown_event(struct perf_event *event)
x86_pmu.num_topdown_events - 1);
}
+static u64 adl_update_topdown_event(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type != hybrid_big)
+ return 0;
+
+ return icl_update_topdown_event(event);
+}
+
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2709,22 +2811,25 @@ int intel_pmu_save_and_restart(struct perf_event *event)
static void intel_pmu_reset(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ int num_counters = hybrid(cpuc->pmu, num_counters);
unsigned long flags;
int idx;
- if (!x86_pmu.num_counters)
+ if (!num_counters)
return;
local_irq_save(flags);
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < num_counters; idx++) {
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- if (fixed_counter_disabled(idx))
+ for (idx = 0; idx < num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
@@ -2753,6 +2858,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int bit;
int handled = 0;
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
inc_irq_stat(apic_perf_irqs);
@@ -2776,7 +2882,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
* processing loop coming after that the function, otherwise
* phony regular samples may be generated in the sampling buffer
* not marked with the EXACT tag. Another possibility is to have
- * one PEBS event and at least one non-PEBS event whic hoverflows
+ * one PEBS event and at least one non-PEBS event which overflows
* while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
* not be set, yet the overflow status bit for the PEBS counter will
* be on Skylake.
@@ -2798,7 +2904,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
handled++;
x86_pmu.drain_pebs(regs, &data);
- status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+ status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
* PMI throttle may be triggered, which stops the PEBS event.
@@ -2816,15 +2922,12 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
handled++;
- if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
- perf_guest_cbs->handle_intel_pt_intr))
- perf_guest_cbs->handle_intel_pt_intr();
- else
+ if (!perf_guest_handle_intel_pt_intr())
intel_pt_interrupt();
}
/*
- * Intel Perf mertrics
+ * Intel Perf metrics
*/
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
handled++;
@@ -2868,28 +2971,32 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
- struct cpu_hw_events *cpuc;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool late_ack = hybrid_bit(cpuc->pmu, late_ack);
+ bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack);
int loops;
u64 status;
int handled;
int pmu_enabled;
- cpuc = this_cpu_ptr(&cpu_hw_events);
-
/*
* Save the PMU state.
* It needs to be restored when leaving the handler.
*/
pmu_enabled = cpuc->enabled;
/*
- * No known reason to not always do late ACK,
- * but just in case do it opt-in.
+ * In general, the early ACK is only applied for old platforms.
+ * For the big core starts from Haswell, the late ACK should be
+ * applied.
+ * For the small core after Tremont, we have to do the ACK right
+ * before re-enabling counters, which is in the middle of the
+ * NMI handler.
*/
- if (!x86_pmu.late_ack)
+ if (!late_ack && !mid_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
intel_bts_disable_local();
cpuc->enabled = 0;
- __intel_pmu_disable_all();
+ __intel_pmu_disable_all(true);
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
status = intel_pmu_get_status();
@@ -2922,6 +3029,8 @@ again:
goto again;
done:
+ if (mid_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
/* Only restore PMU state when it's active. See x86_pmu_disable(). */
cpuc->enabled = pmu_enabled;
if (pmu_enabled)
@@ -2933,7 +3042,7 @@ done:
* have been reset. This avoids spurious NMIs on
* Haswell CPUs.
*/
- if (x86_pmu.late_ack)
+ if (late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
}
@@ -2955,14 +3064,18 @@ intel_vlbr_constraints(struct perf_event *event)
{
struct event_constraint *c = &vlbr_constraint;
- if (unlikely(constraint_match(c, event->hw.config)))
+ if (unlikely(constraint_match(c, event->hw.config))) {
+ event->hw.flags |= c->flags;
return c;
+ }
return NULL;
}
-static int intel_alt_er(int idx, u64 config)
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+ int idx, u64 config)
{
+ struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
int alt_idx = idx;
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
@@ -2974,7 +3087,7 @@ static int intel_alt_er(int idx, u64 config)
if (idx == EXTRA_REG_RSP_1)
alt_idx = EXTRA_REG_RSP_0;
- if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+ if (config & ~extra_regs[alt_idx].valid_mask)
return idx;
return alt_idx;
@@ -2982,15 +3095,16 @@ static int intel_alt_er(int idx, u64 config)
static void intel_fixup_er(struct perf_event *event, int idx)
{
+ struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
event->hw.extra_reg.idx = idx;
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
}
@@ -3066,7 +3180,7 @@ again:
*/
c = NULL;
} else {
- idx = intel_alt_er(idx, reg->config);
+ idx = intel_alt_er(cpuc, idx, reg->config);
if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -3131,10 +3245,11 @@ struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
+ struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
struct event_constraint *c;
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (event_constraints) {
+ for_each_event_constraint(c, event_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
@@ -3142,7 +3257,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
}
- return &unconstrained;
+ return &hybrid_var(cpuc->pmu, unconstrained);
}
static struct event_constraint *
@@ -3646,6 +3761,23 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
}
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+ if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+ return false;
+
+ if (is_hybrid())
+ return hybrid_pmu(event->pmu)->cpu_type == hybrid_big;
+
+ return true;
+}
+
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+ union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
+
+ return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
static int intel_pmu_hw_config(struct perf_event *event)
{
@@ -3702,7 +3834,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
}
- if (event->attr.type != PERF_TYPE_RAW)
+ if ((event->attr.type == PERF_TYPE_HARDWARE) ||
+ (event->attr.type == PERF_TYPE_HW_CACHE))
return 0;
/*
@@ -3715,7 +3848,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
* with a slots event as group leader. When the slots event
* is used in a metrics group, it too cannot support sampling.
*/
- if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) {
+ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
return -EINVAL;
@@ -3766,7 +3899,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
* event. The rule is to simplify the implementation of the check.
* That's because perf cannot have a complete group at the moment.
*/
- if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) {
struct perf_event *leader = event->group_leader;
@@ -3801,10 +3934,11 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
- arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
- arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+ arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+ arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
arr[0].guest &= ~cpuc->pebs_enabled;
else
@@ -3973,8 +4107,10 @@ spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
* The :ppp indicates the Precise Distribution (PDist) facility, which
* is only supported on the GP counter 0. If a :ppp event which is not
* available on the GP counter 0, error out.
+ * Exception: Instruction PDIR is only available on the fixed counter 0.
*/
- if (event->attr.precise_ip == 3) {
+ if ((event->attr.precise_ip == 3) &&
+ !constraint_match(&fixed0_constraint, event->hw.config)) {
if (c->idxmsk64 & BIT_ULL(0))
return &counter0_constraint;
@@ -4042,6 +4178,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return c;
}
+static struct event_constraint *
+adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return spr_get_event_constraints(cpuc, idx, event);
+ else if (pmu->cpu_type == hybrid_small)
+ return tnt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
+static int adl_hw_config(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return hsw_hw_config(event);
+ else if (pmu->cpu_type == hybrid_small)
+ return intel_pmu_hw_config(event);
+
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+
+static u8 adl_get_hybrid_cpu_type(void)
+{
+ return hybrid_big;
+}
+
/*
* Broadwell:
*
@@ -4145,7 +4314,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
{
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
- if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
@@ -4199,12 +4368,62 @@ static void flip_smm_bit(void *data)
}
}
+static bool init_hybrid_pmu(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ u8 cpu_type = get_this_hybrid_cpu_type();
+ struct x86_hybrid_pmu *pmu = NULL;
+ int i;
+
+ if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) {
+ pmu = &x86_pmu.hybrid_pmu[i];
+ break;
+ }
+ }
+ if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) {
+ cpuc->pmu = NULL;
+ return false;
+ }
+
+ /* Only check and dump the PMU information for the first CPU */
+ if (!cpumask_empty(&pmu->supported_cpus))
+ goto end;
+
+ if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
+ return false;
+
+ pr_info("%s PMU driver: ", pmu->name);
+
+ if (pmu->intel_cap.pebs_output_pt_available)
+ pr_cont("PEBS-via-PT ");
+
+ pr_cont("\n");
+
+ x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed,
+ pmu->intel_ctrl);
+
+end:
+ cpumask_set_cpu(cpu, &pmu->supported_cpus);
+ cpuc->pmu = &pmu->pmu;
+
+ x86_pmu_update_cpu_context(&pmu->pmu, cpu);
+
+ return true;
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int core_id = topology_core_id(cpu);
int i;
+ if (is_hybrid() && !init_hybrid_pmu(cpu))
+ return;
+
init_debug_store_on_cpu(cpu);
/*
* Deal with CPUs that don't clear their LBRs on power-up.
@@ -4222,8 +4441,16 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- /* Disable perf metrics if any added CPU doesn't support it. */
- if (x86_pmu.intel_cap.perf_metrics) {
+ /*
+ * Disable perf metrics if any added CPU doesn't support it.
+ *
+ * Turn off the check for a hybrid architecture, because the
+ * architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate
+ * the architecture features. The perf metrics is a model-specific
+ * feature for now. The corresponding bit should always be 0 on
+ * a hybrid platform, e.g., Alder Lake.
+ */
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
@@ -4310,7 +4537,12 @@ void intel_cpuc_finish(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dead(int cpu)
{
- intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ intel_cpuc_finish(cpuc);
+
+ if (is_hybrid() && cpuc->pmu)
+ cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -4331,14 +4563,30 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value)
return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
}
+static void intel_aux_output_init(void)
+{
+ /* Refer also intel_pmu_aux_output_match() */
+ if (x86_pmu.intel_cap.pebs_output_pt_available)
+ x86_pmu.assign = intel_pmu_assign_event;
+}
+
static int intel_pmu_aux_output_match(struct perf_event *event)
{
+ /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */
if (!x86_pmu.intel_cap.pebs_output_pt_available)
return 0;
return is_intel_pt_event(event);
}
+static int intel_pmu_filter_match(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+ unsigned int cpu = smp_processor_id();
+
+ return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -4476,6 +4724,19 @@ static __initconst const struct x86_pmu intel_pmu = {
.lbr_read = intel_pmu_lbr_read_64,
.lbr_save = intel_pmu_lbr_save,
.lbr_restore = intel_pmu_lbr_restore,
+
+ /*
+ * SMM has access to all 4 rings and while traditionally SMM code only
+ * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM.
+ *
+ * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction
+ * between SMM or not, this results in what should be pure userspace
+ * counters including SMM data.
+ *
+ * This is a clear privilege issue, therefore globally disable
+ * counting SMM by default.
+ */
+ .attr_freeze_on_smi = 1,
};
static __init void intel_clovertown_quirk(void)
@@ -4516,7 +4777,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
@@ -4594,7 +4855,7 @@ static bool check_msr(unsigned long msr, u64 mask)
/*
* Disable the check for real HW, so we don't
- * mess with potentionaly enabled registers:
+ * mess with potentially enabled registers:
*/
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return true;
@@ -4659,7 +4920,7 @@ static __init void intel_arch_events_quirk(void)
{
int bit;
- /* disable event that reported as not presend by cpuid */
+ /* disable event that reported as not present by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
pr_warn("CPUID marked event: \'%s\' unavailable\n",
@@ -4861,9 +5122,9 @@ static ssize_t freeze_on_smi_store(struct device *cdev,
x86_pmu.attr_freeze_on_smi = val;
- get_online_cpus();
+ cpus_read_lock();
on_each_cpu(flip_smm_bit, &val, 1);
- put_online_cpus();
+ cpus_read_unlock();
done:
mutex_unlock(&freeze_on_smi_mutex);
@@ -4879,7 +5140,7 @@ static void update_tfa_sched(void *ignored)
* and if so force schedule out for all event types all contexts
*/
if (test_bit(3, cpuc->active_mask))
- perf_pmu_resched(x86_get_pmu());
+ perf_pmu_resched(x86_get_pmu(smp_processor_id()));
}
static ssize_t show_sysctl_tfa(struct device *cdev,
@@ -4906,9 +5167,9 @@ static ssize_t set_sysctl_tfa(struct device *cdev,
allow_tsx_force_abort = val;
- get_online_cpus();
+ cpus_read_lock();
on_each_cpu(update_tfa_sched, NULL, 1);
- put_online_cpus();
+ cpus_read_unlock();
return count;
}
@@ -5041,8 +5302,303 @@ static const struct attribute_group *attr_update[] = {
NULL,
};
+EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_adl, "event=0xc2,umask=0x0;event=0x00,umask=0x80", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-bad-spec, td_bad_spec_adl, "event=0x73,umask=0x0;event=0x00,umask=0x81", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_adl, "event=0x71,umask=0x0;event=0x00,umask=0x82", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_adl, "event=0x74,umask=0x0;event=0x00,umask=0x83", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-heavy-ops, td_heavy_ops_adl, "event=0x00,umask=0x84", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-br-mispredict, td_br_mis_adl, "event=0x00,umask=0x85", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-fetch-lat, td_fetch_lat_adl, "event=0x00,umask=0x86", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-mem-bound, td_mem_bound_adl, "event=0x00,umask=0x87", hybrid_big);
+
+static struct attribute *adl_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_adl),
+ EVENT_PTR(td_bad_spec_adl),
+ EVENT_PTR(td_fe_bound_adl),
+ EVENT_PTR(td_be_bound_adl),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL,
+};
+
+/* Must be in IDX order */
+EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big);
+
+static struct attribute *adl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ EVENT_PTR(mem_ld_aux_adl),
+ NULL,
+};
+
+EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-conflict, tx_conflict_adl, "event=0x54,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-t, cycles_t_adl, "event=0x3c,in_tx=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-ct, cycles_ct_adl, "event=0x3c,in_tx=1,in_tx_cp=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-read, tx_capacity_read_adl, "event=0x54,umask=0x80", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-write, tx_capacity_write_adl, "event=0x54,umask=0x2", hybrid_big);
+
+static struct attribute *adl_hybrid_tsx_attrs[] = {
+ EVENT_PTR(tx_start_adl),
+ EVENT_PTR(tx_abort_adl),
+ EVENT_PTR(tx_commit_adl),
+ EVENT_PTR(tx_capacity_read_adl),
+ EVENT_PTR(tx_capacity_write_adl),
+ EVENT_PTR(tx_conflict_adl),
+ EVENT_PTR(cycles_t_adl),
+ EVENT_PTR(cycles_ct_adl),
+ NULL,
+};
+
+FORMAT_ATTR_HYBRID(in_tx, hybrid_big);
+FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big);
+FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small);
+FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small);
+FORMAT_ATTR_HYBRID(frontend, hybrid_big);
+
+static struct attribute *adl_hybrid_extra_attr_rtm[] = {
+ FORMAT_HYBRID_PTR(in_tx),
+ FORMAT_HYBRID_PTR(in_tx_cp),
+ FORMAT_HYBRID_PTR(offcore_rsp),
+ FORMAT_HYBRID_PTR(ldlat),
+ FORMAT_HYBRID_PTR(frontend),
+ NULL,
+};
+
+static struct attribute *adl_hybrid_extra_attr[] = {
+ FORMAT_HYBRID_PTR(offcore_rsp),
+ FORMAT_HYBRID_PTR(ldlat),
+ FORMAT_HYBRID_PTR(frontend),
+ NULL,
+};
+
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
+
+ return pmu->cpu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+ int cpu = cpumask_first(&pmu->supported_cpus);
+
+ return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_format_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+}
+
+static struct attribute_group hybrid_group_events_td = {
+ .name = "events",
+ .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+ .name = "events",
+ .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+ .name = "events",
+ .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+ .name = "format",
+ .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+ &dev_attr_cpus.attr,
+ NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+ .attrs = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+ &hybrid_group_events_td,
+ &hybrid_group_events_mem,
+ &hybrid_group_events_tsx,
+ &group_caps_gen,
+ &group_caps_lbr,
+ &hybrid_group_format_extra,
+ &group_default,
+ &hybrid_group_cpus,
+ NULL,
+};
+
static struct attribute *empty_attrs;
+static void intel_pmu_check_num_counters(int *num_counters,
+ int *num_counters_fixed,
+ u64 *intel_ctrl, u64 fixed_mask)
+{
+ if (*num_counters > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ *num_counters, INTEL_PMC_MAX_GENERIC);
+ *num_counters = INTEL_PMC_MAX_GENERIC;
+ }
+ *intel_ctrl = (1ULL << *num_counters) - 1;
+
+ if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ *num_counters_fixed, INTEL_PMC_MAX_FIXED);
+ *num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ }
+
+ *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
+}
+
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl)
+{
+ struct event_constraint *c;
+
+ if (!event_constraints)
+ return;
+
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
+ for_each_event_constraint(c, event_constraints) {
+ /*
+ * Don't extend the topdown slots and metrics
+ * events to the generic counters.
+ */
+ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events,
+ * if slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+ c->idxmsk64 = 0;
+ c->weight = hweight64(c->idxmsk64);
+ continue;
+ }
+
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disabled fixed counters which are not in CPUID */
+ c->idxmsk64 &= intel_ctrl;
+
+ /*
+ * Don't extend the pseudo-encoding to the
+ * generic counters
+ */
+ if (!use_fixed_pseudo_encoding(c->code))
+ c->idxmsk64 |= (1ULL << num_counters) - 1;
+ }
+ c->idxmsk64 &=
+ ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+ c->weight = hweight64(c->idxmsk64);
+ }
+}
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+ struct extra_reg *er;
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (!extra_regs)
+ return;
+
+ for (er = extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+}
+
+static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
+{
+ struct x86_hybrid_pmu *pmu;
+ int i;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ pmu = &x86_pmu.hybrid_pmu[i];
+
+ intel_pmu_check_num_counters(&pmu->num_counters,
+ &pmu->num_counters_fixed,
+ &pmu->intel_ctrl,
+ fixed_mask);
+
+ if (pmu->intel_cap.perf_metrics) {
+ pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS;
+ }
+
+ if (pmu->intel_cap.pebs_output_pt_available)
+ pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+
+ intel_pmu_check_event_constraints(pmu->event_constraints,
+ pmu->num_counters,
+ pmu->num_counters_fixed,
+ pmu->intel_ctrl);
+
+ intel_pmu_check_extra_regs(pmu->extra_regs);
+ }
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
@@ -5053,12 +5609,11 @@ __init int intel_pmu_init(void)
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
- struct event_constraint *c;
unsigned int fixed_mask;
- struct extra_reg *er;
bool pmem = false;
int version, i;
char *name;
+ struct x86_hybrid_pmu *pmu;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -5564,7 +6119,13 @@ __init int intel_pmu_init(void)
tsx_attr = hsw_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(pmem);
- if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ /*
+ * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default.
+ * TSX force abort hooks are not required on these systems. Only deploy
+ * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) &&
+ !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
x86_pmu.flags |= PMU_FL_TFA;
x86_pmu.get_event_constraints = tfa_get_event_constraints;
x86_pmu.enable_all = intel_tfa_pmu_enable_all;
@@ -5653,6 +6214,121 @@ __init int intel_pmu_init(void)
name = "sapphire_rapids";
break;
+ case INTEL_FAM6_ALDERLAKE:
+ case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_RAPTORLAKE:
+ case INTEL_FAM6_RAPTORLAKE_P:
+ /*
+ * Alder Lake has 2 types of CPU, core and atom.
+ *
+ * Initialize the common PerfMon capabilities here.
+ */
+ x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS,
+ sizeof(struct x86_hybrid_pmu),
+ GFP_KERNEL);
+ if (!x86_pmu.hybrid_pmu)
+ return -ENOMEM;
+ static_branch_enable(&perf_is_hybrid);
+ x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS;
+
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(false);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = adl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
+
+ x86_pmu.filter_match = intel_pmu_filter_match;
+ x86_pmu.get_event_constraints = adl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
+ /*
+ * The rtm_abort_event is used to check whether to enable GPRs
+ * for the RTM abort event. Atom doesn't have the RTM abort
+ * event. There is no harmful to set it in the common
+ * x86_pmu.rtm_abort_event.
+ */
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+
+ td_attr = adl_hybrid_events_attrs;
+ mem_attr = adl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ adl_hybrid_extra_attr_rtm : adl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ pmu->name = "cpu_core";
+ pmu->cpu_type = hybrid_big;
+ pmu->late_ack = true;
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ pmu->num_counters = x86_pmu.num_counters + 2;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
+ } else {
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ }
+
+ /*
+ * Quirk: For some Alder Lake machine, when all E-cores are disabled in
+ * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However,
+ * the X86_FEATURE_HYBRID_CPU is still set. The above codes will
+ * mistakenly add extra counters for P-cores. Correct the number of
+ * counters here.
+ */
+ if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) {
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ }
+
+ pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ pmu->intel_cap.perf_metrics = 1;
+ pmu->intel_cap.pebs_output_pt_available = 0;
+
+ memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
+ memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
+ pmu->event_constraints = intel_spr_event_constraints;
+ pmu->pebs_constraints = intel_spr_pebs_event_constraints;
+ pmu->extra_regs = intel_spr_extra_regs;
+
+ /* Initialize Atom core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ pmu->name = "cpu_atom";
+ pmu->cpu_type = hybrid_small;
+ pmu->mid_ack = true;
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ pmu->max_pebs_events = x86_pmu.max_pebs_events;
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ pmu->intel_cap.perf_metrics = 0;
+ pmu->intel_cap.pebs_output_pt_available = 1;
+
+ memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
+ memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
+ pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+ pmu->event_constraints = intel_slm_event_constraints;
+ pmu->pebs_constraints = intel_grt_pebs_event_constraints;
+ pmu->extra_regs = intel_grt_extra_regs;
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5660,7 +6336,9 @@ __init int intel_pmu_init(void)
pr_cont("generic architected perfmon v1, ");
name = "generic_arch_v1";
break;
- default:
+ case 2:
+ case 3:
+ case 4:
/*
* default constraints for v2 and up
*/
@@ -5668,80 +6346,63 @@ __init int intel_pmu_init(void)
pr_cont("generic architected perfmon, ");
name = "generic_arch_v2+";
break;
+ default:
+ /*
+ * The default constraints for v5 and up can support up to
+ * 16 fixed counters. For the fixed counters 4 and later,
+ * the pseudo-encoding is applied.
+ * The constraints may be cut according to the CPUID enumeration
+ * by inserting the EVENT_CONSTRAINT_END.
+ */
+ if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED)
+ x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1;
+ x86_pmu.event_constraints = intel_v5_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v5+";
+ break;
}
}
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
+ if (!is_hybrid()) {
+ group_events_td.attrs = td_attr;
+ group_events_mem.attrs = mem_attr;
+ group_events_tsx.attrs = tsx_attr;
+ group_format_extra.attrs = extra_attr;
+ group_format_extra_skl.attrs = extra_skl_attr;
- group_events_td.attrs = td_attr;
- group_events_mem.attrs = mem_attr;
- group_events_tsx.attrs = tsx_attr;
- group_format_extra.attrs = extra_attr;
- group_format_extra_skl.attrs = extra_skl_attr;
-
- x86_pmu.attr_update = attr_update;
-
- if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
- x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
- }
- x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
+ x86_pmu.attr_update = attr_update;
+ } else {
+ hybrid_group_events_td.attrs = td_attr;
+ hybrid_group_events_mem.attrs = mem_attr;
+ hybrid_group_events_tsx.attrs = tsx_attr;
+ hybrid_group_format_extra.attrs = extra_attr;
- if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ x86_pmu.attr_update = hybrid_attr_update;
}
- x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+ intel_pmu_check_num_counters(&x86_pmu.num_counters,
+ &x86_pmu.num_counters_fixed,
+ &x86_pmu.intel_ctrl,
+ (u64)fixed_mask);
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
- if (x86_pmu.event_constraints) {
- /*
- * event on fixed counter2 (REF_CYCLES) only works on this
- * counter, so do not extend mask to generic counters
- */
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- /*
- * Don't extend the topdown slots and metrics
- * events to the generic counters.
- */
- if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
- /*
- * Disable topdown slots and metrics events,
- * if slots event is not in CPUID.
- */
- if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
- c->idxmsk64 = 0;
- c->weight = hweight64(c->idxmsk64);
- continue;
- }
-
- if (c->cmask == FIXED_EVENT_FLAGS) {
- /* Disabled fixed counters which are not in CPUID */
- c->idxmsk64 &= x86_pmu.intel_ctrl;
-
- if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- }
- c->idxmsk64 &=
- ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
- c->weight = hweight64(c->idxmsk64);
- }
- }
-
+ intel_pmu_check_event_constraints(x86_pmu.event_constraints,
+ x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
/*
* Access LBR MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support LBR MSR
* Check all LBT MSR here.
* Disable LBR access if any LBR MSRs can not be accessed.
*/
- if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
x86_pmu.lbr_nr = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
@@ -5749,23 +6410,25 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
- if (x86_pmu.lbr_nr)
+ if (x86_pmu.lbr_nr) {
+ intel_pmu_lbr_init();
+
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
- /*
- * Access extra MSR may cause #GP under certain circumstances.
- * E.g. KVM doesn't support offcore event
- * Check all extra_regs here.
- */
- if (x86_pmu.extra_regs) {
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- er->extra_msr_access = check_msr(er->msr, 0x11UL);
- /* Disable LBR select mapping */
- if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
- x86_pmu.lbr_sel_map = NULL;
+ /* only support branch_stack snapshot for perfmon >= v2 */
+ if (x86_pmu.disable_all == intel_pmu_disable_all) {
+ if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_arch_branch_stack);
+ } else {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_branch_stack);
+ }
}
}
+ intel_pmu_check_extra_regs(x86_pmu.extra_regs);
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
@@ -5773,9 +6436,14 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- if (x86_pmu.intel_cap.perf_metrics)
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ if (is_hybrid())
+ intel_pmu_check_hybrid_pmus((u64)fixed_mask);
+
+ intel_aux_output_init();
+
return 0;
}
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 407eee5f6f95..8ec23f47fee9 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL,TNT
+ * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -50,47 +50,51 @@
* MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL
+ * ICL,TGL,RKL,ADL,RPL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL
+ * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
+ * RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL
+ * KBL,CML,ICL,TGL,RKL,ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL,RPL
* Scope: Package (physical package)
*
*/
@@ -563,6 +567,28 @@ static const struct cstate_model icl_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model icx_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+};
+
+static const struct cstate_model adl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
static const struct cstate_model slm_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
@@ -647,9 +673,18 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
+
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index d32b302719fe..376cc3d66094 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -779,6 +779,13 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_grt_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_PLD_CONSTRAINT(0x5d0, 0xf),
+ INTEL_PSD_CONSTRAINT(0x6d0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
@@ -916,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = {
};
struct event_constraint intel_icl_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */
INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -936,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
};
struct event_constraint intel_spr_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
@@ -959,13 +967,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = {
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
+ struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
struct event_constraint *c;
if (!event->attr.precise_ip)
return NULL;
- if (x86_pmu.pebs_constraints) {
- for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+ if (pebs_constraints) {
+ for_each_event_constraint(c, pebs_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
@@ -1007,6 +1016,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
+ int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
u64 threshold;
int reserved;
@@ -1014,9 +1025,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
return;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
- reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed;
+ reserved = max_pebs_events + num_counters_fixed;
else
- reserved = x86_pmu.max_pebs_events;
+ reserved = max_pebs_events;
if (cpuc->n_pebs == cpuc->n_large_pebs) {
threshold = ds->pebs_absolute_maximum -
@@ -1177,6 +1188,9 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
+ u64 value = ds->pebs_event_reset[hwc->idx];
+ u32 base = MSR_RELOAD_PMC0;
+ unsigned int idx = hwc->idx;
if (!is_pebs_pt(event))
return;
@@ -1186,7 +1200,15 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
cpuc->pebs_enabled |= PEBS_OUTPUT_PT;
- wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]);
+ if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+ base = MSR_RELOAD_FIXED_CTR0;
+ idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ if (x86_pmu.intel_cap.pebs_format < 5)
+ value = ds->pebs_event_reset[MAX_PEBS_EVENTS_FMT4 + idx];
+ else
+ value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx];
+ }
+ wrmsrl(base + idx, value);
}
void intel_pmu_pebs_enable(struct perf_event *event)
@@ -1194,6 +1216,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
+ unsigned int idx = hwc->idx;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
@@ -1212,19 +1235,22 @@ void intel_pmu_pebs_enable(struct perf_event *event)
}
}
+ if (idx >= INTEL_PMC_IDX_FIXED) {
+ if (x86_pmu.intel_cap.pebs_format < 5)
+ idx = MAX_PEBS_EVENTS_FMT4 + (idx - INTEL_PMC_IDX_FIXED);
+ else
+ idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
+ }
+
/*
* Use auto-reload if possible to save a MSR write in the PMI.
* This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
- unsigned int idx = hwc->idx;
-
- if (idx >= INTEL_PMC_IDX_FIXED)
- idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
ds->pebs_event_reset[idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
} else {
- ds->pebs_event_reset[hwc->idx] = 0;
+ ds->pebs_event_reset[idx] = 0;
}
intel_pmu_pebs_via_pt_enable(event);
@@ -1283,7 +1309,7 @@ void intel_pmu_pebs_disable_all(void)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ __intel_pmu_pebs_disable_all();
}
static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
@@ -1353,14 +1379,13 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
is_64bit = kernel_ip(to) || any_64bit_mode(regs);
#endif
insn_init(&insn, kaddr, size, is_64bit);
- insn_get_length(&insn);
+
/*
- * Make sure there was not a problem decoding the
- * instruction and getting the length. This is
- * doubly important because we have an infinite
- * loop if insn.length=0.
+ * Make sure there was not a problem decoding the instruction.
+ * This is doubly important because we have an infinite loop if
+ * insn.length=0.
*/
- if (!insn.length)
+ if (insn_get_length(&insn))
break;
to += insn.length;
@@ -1805,7 +1830,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
*
* [-period, 0]
*
- * the difference between two consequtive reads is:
+ * the difference between two consecutive reads is:
*
* A) value2 - value1;
* when no overflows have happened in between,
@@ -2072,6 +2097,8 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct debug_store *ds = cpuc->ds;
struct perf_event *event;
void *base, *at, *top;
@@ -2086,9 +2113,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = ((1ULL << x86_pmu.max_pebs_events) - 1) |
- (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
- size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ mask = ((1ULL << max_pebs_events) - 1) |
+ (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
+ size = INTEL_PMC_IDX_FIXED + num_counters_fixed;
if (unlikely(base >= top)) {
intel_pmu_pebs_event_update_no_drain(cpuc, size);
@@ -2184,6 +2211,7 @@ void __init intel_ds_init(void)
break;
case 4:
+ case 5:
x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
if (x86_pmu.intel_cap.pebs_baseline) {
@@ -2192,7 +2220,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
- x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
x86_pmu.large_pebs_flags &=
@@ -2205,9 +2233,9 @@ void __init intel_ds_init(void)
}
pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
- if (x86_pmu.intel_cap.pebs_output_pt_available) {
+ if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
- x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
}
break;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21890dacfcfe..13179f31fe10 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -8,14 +8,6 @@
#include "../perf_event.h"
-static const enum {
- LBR_EIP_FLAGS = 1,
- LBR_TSX = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
- [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
- [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
/*
* Intel LBR_SELECT bits
* Intel Vol3a, April 2011, Section 16.7 Table 16-10
@@ -228,20 +220,6 @@ static void __intel_pmu_lbr_enable(bool pmi)
wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
}
-static void __intel_pmu_lbr_disable(void)
-{
- u64 debugctl;
-
- if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
- wrmsrl(MSR_ARCH_LBR_CTL, 0);
- return;
- }
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
void intel_pmu_lbr_reset_32(void)
{
int i;
@@ -257,7 +235,7 @@ void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (x86_pmu.lbr_has_info)
wrmsrl(x86_pmu.lbr_info + i, 0);
}
}
@@ -279,6 +257,8 @@ void intel_pmu_lbr_reset(void)
cpuc->last_task_ctx = NULL;
cpuc->last_log_id = 0;
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select)
+ wrmsrl(MSR_LBR_SELECT, 0);
}
/*
@@ -317,11 +297,10 @@ enum {
*/
static inline bool lbr_from_signext_quirk_needed(void)
{
- int lbr_format = x86_pmu.intel_cap.lbr_format;
bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
boot_cpu_has(X86_FEATURE_RTM);
- return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX);
+ return !tsx_support && x86_pmu.lbr_has_tsx;
}
static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
@@ -439,12 +418,12 @@ rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
void intel_pmu_lbr_restore(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
- int i;
- unsigned lbr_idx, mask;
+ bool need_info = x86_pmu.lbr_has_info;
u64 tos = task_ctx->tos;
+ unsigned lbr_idx, mask;
+ int i;
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
@@ -456,7 +435,7 @@ void intel_pmu_lbr_restore(void *ctx)
lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, 0);
wrlbr_to(lbr_idx, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (need_info)
wrlbr_info(lbr_idx, 0);
}
@@ -491,7 +470,7 @@ static void intel_pmu_arch_lbr_xrstors(void *ctx)
{
struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
- copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+ xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
}
static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
@@ -531,9 +510,9 @@ static void __intel_pmu_lbr_restore(void *ctx)
void intel_pmu_lbr_save(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
+ bool need_info = x86_pmu.lbr_has_info;
unsigned lbr_idx, mask;
u64 tos;
int i;
@@ -576,7 +555,7 @@ static void intel_pmu_arch_lbr_xsaves(void *ctx)
{
struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
- copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+ xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
}
static void __intel_pmu_lbr_save(void *ctx)
@@ -658,7 +637,6 @@ static inline bool branch_user_callstack(unsigned br_sel)
void intel_pmu_lbr_add(struct perf_event *event)
{
- struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
@@ -696,16 +674,11 @@ void intel_pmu_lbr_add(struct perf_event *event)
perf_sched_cb_inc(event->ctx->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
-
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
- kmem_cache && !cpuc->lbr_xsave &&
- (cpuc->lbr_users != cpuc->lbr_pebs_users))
- cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
}
void release_lbr_buffers(void)
{
- struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+ struct kmem_cache *kmem_cache;
struct cpu_hw_events *cpuc;
int cpu;
@@ -714,6 +687,7 @@ void release_lbr_buffers(void)
for_each_possible_cpu(cpu) {
cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
if (kmem_cache && cpuc->lbr_xsave) {
kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
cpuc->lbr_xsave = NULL;
@@ -721,6 +695,27 @@ void release_lbr_buffers(void)
}
}
+void reserve_lbr_buffers(void)
+{
+ struct kmem_cache *kmem_cache;
+ struct cpu_hw_events *cpuc;
+ int cpu;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
+ if (!kmem_cache || cpuc->lbr_xsave)
+ continue;
+
+ cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache,
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
+ }
+}
+
void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -763,13 +758,18 @@ void intel_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (cpuc->lbr_users && !vlbr_exclude_host())
+ if (cpuc->lbr_users && !vlbr_exclude_host()) {
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return __intel_pmu_arch_lbr_disable();
+
__intel_pmu_lbr_disable();
+ }
}
void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
u64 tos = intel_pmu_lbr_tos();
int i;
@@ -785,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].mispred = 0;
- cpuc->lbr_entries[i].predicted = 0;
- cpuc->lbr_entries[i].in_tx = 0;
- cpuc->lbr_entries[i].abort = 0;
- cpuc->lbr_entries[i].cycles = 0;
- cpuc->lbr_entries[i].type = 0;
- cpuc->lbr_entries[i].reserved = 0;
+ perf_clear_branch_entry_bitfields(br);
+
+ br->from = msr_lastbranch.from;
+ br->to = msr_lastbranch.to;
+ br++;
}
cpuc->lbr_stack.nr = i;
cpuc->lbr_stack.hw_idx = tos;
@@ -808,7 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
- int lbr_format = x86_pmu.intel_cap.lbr_format;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
@@ -823,9 +819,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
- int skip = 0;
u16 cycles = 0;
- int lbr_flags = lbr_desc[lbr_format];
from = rdlbr_from(lbr_idx, NULL);
to = rdlbr_to(lbr_idx, NULL);
@@ -837,37 +831,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (call_stack && !from)
break;
- if (lbr_format == LBR_FORMAT_INFO && need_info) {
- u64 info;
-
- info = rdlbr_info(lbr_idx, NULL);
- mis = !!(info & LBR_INFO_MISPRED);
- pred = !mis;
- in_tx = !!(info & LBR_INFO_IN_TX);
- abort = !!(info & LBR_INFO_ABORT);
- cycles = (info & LBR_INFO_CYCLES);
- }
-
- if (lbr_format == LBR_FORMAT_TIME) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- cycles = ((to >> 48) & LBR_INFO_CYCLES);
-
- to = (u64)((((s64)to) << 16) >> 16);
- }
-
- if (lbr_flags & LBR_EIP_FLAGS) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- }
- if (lbr_flags & LBR_TSX) {
- in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
- abort = !!(from & LBR_FROM_FLAG_ABORT);
- skip = 3;
+ if (x86_pmu.lbr_has_info) {
+ if (need_info) {
+ u64 info;
+
+ info = rdlbr_info(lbr_idx, NULL);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ cycles = (info & LBR_INFO_CYCLES);
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ }
+ }
+ } else {
+ int skip = 0;
+
+ if (x86_pmu.lbr_from_flags) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ }
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ if (x86_pmu.lbr_to_cycles) {
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
}
- from = (u64)((((s64)from) << skip) >> skip);
/*
* Some CPUs report duplicated abort records,
@@ -880,52 +876,54 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (abort && x86_pmu.lbr_double_abort && out > 0)
out--;
- cpuc->lbr_entries[out].from = from;
- cpuc->lbr_entries[out].to = to;
- cpuc->lbr_entries[out].mispred = mis;
- cpuc->lbr_entries[out].predicted = pred;
- cpuc->lbr_entries[out].in_tx = in_tx;
- cpuc->lbr_entries[out].abort = abort;
- cpuc->lbr_entries[out].cycles = cycles;
- cpuc->lbr_entries[out].type = 0;
- cpuc->lbr_entries[out].reserved = 0;
+ perf_clear_branch_entry_bitfields(br+out);
+ br[out].from = from;
+ br[out].to = to;
+ br[out].mispred = mis;
+ br[out].predicted = pred;
+ br[out].in_tx = in_tx;
+ br[out].abort = abort;
+ br[out].cycles = cycles;
out++;
}
cpuc->lbr_stack.nr = out;
cpuc->lbr_stack.hw_idx = tos;
}
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_type);
+
static __always_inline int get_lbr_br_type(u64 info)
{
- if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type)
- return 0;
+ int type = 0;
- return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+ if (static_branch_likely(&x86_lbr_type))
+ type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+
+ return type;
}
static __always_inline bool get_lbr_mispred(u64 info)
{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
+ bool mispred = 0;
- return !!(info & LBR_INFO_MISPRED);
-}
+ if (static_branch_likely(&x86_lbr_mispred))
+ mispred = !!(info & LBR_INFO_MISPRED);
-static __always_inline bool get_lbr_predicted(u64 info)
-{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
-
- return !(info & LBR_INFO_MISPRED);
+ return mispred;
}
static __always_inline u16 get_lbr_cycles(u64 info)
{
+ u16 cycles = info & LBR_INFO_CYCLES;
+
if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
- !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
- return 0;
+ (!static_branch_likely(&x86_lbr_cycles) ||
+ !(info & LBR_INFO_CYC_CNT_VALID)))
+ cycles = 0;
- return info & LBR_INFO_CYCLES;
+ return cycles;
}
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
@@ -950,15 +948,16 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
to = rdlbr_to(i, lbr);
info = rdlbr_info(i, lbr);
+ perf_clear_branch_entry_bitfields(e);
+
e->from = from;
e->to = to;
e->mispred = get_lbr_mispred(info);
- e->predicted = get_lbr_predicted(info);
+ e->predicted = !e->mispred;
e->in_tx = !!(info & LBR_INFO_IN_TX);
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
e->type = get_lbr_br_type(info);
- e->reserved = 0;
}
cpuc->lbr_stack.nr = i;
@@ -977,7 +976,7 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
intel_pmu_store_lbr(cpuc, NULL);
return;
}
- copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+ xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
}
@@ -1112,7 +1111,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
(br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
- (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ x86_pmu.lbr_has_info)
reg->config |= LBR_NO_INFO;
return 0;
@@ -1198,7 +1197,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
/*
* The LBR logs any address in the IP, even if the IP just
* faulted. This means userspace can control the from address.
- * Ensure we don't blindy read any address by validating it is
+ * Ensure we don't blindly read any address by validating it is
* a known text address.
*/
if (kernel_text_address(from)) {
@@ -1224,8 +1223,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
#endif
insn_init(&insn, addr, bytes_read, is64);
- insn_get_opcode(&insn);
- if (!insn.opcode.got)
+ if (insn_get_opcode(&insn))
return X86_BR_ABORT;
switch (insn.opcode.bytes[0]) {
@@ -1262,8 +1260,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
- insn_get_immediate(&insn);
- if (insn.immediate1.value == 0) {
+ if (insn_get_immediate(&insn) || insn.immediate1.value == 0) {
/* zero length call */
ret = X86_BR_ZERO_CALL;
break;
@@ -1279,7 +1276,9 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_JMP;
break;
case 0xff: /* call near absolute, call far absolute ind */
- insn_get_modrm(&insn);
+ if (insn_get_modrm(&insn))
+ return X86_BR_ABORT;
+
ext = (insn.modrm.bytes[0] >> 3) & 0x7;
switch (ext) {
case 2: /* near ind call */
@@ -1328,10 +1327,10 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = {
PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
PERF_BR_SYSRET, /* X86_BR_SYSRET */
PERF_BR_UNKNOWN, /* X86_BR_INT */
- PERF_BR_UNKNOWN, /* X86_BR_IRET */
+ PERF_BR_ERET, /* X86_BR_IRET */
PERF_BR_COND, /* X86_BR_JCC */
PERF_BR_UNCOND, /* X86_BR_JMP */
- PERF_BR_UNKNOWN, /* X86_BR_IRQ */
+ PERF_BR_IRQ, /* X86_BR_IRQ */
PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
PERF_BR_UNKNOWN, /* X86_BR_ABORT */
PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
@@ -1609,7 +1608,7 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
- x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
if (lbr_from_signext_quirk_needed())
static_branch_enable(&lbr_from_quirk_key);
@@ -1629,7 +1628,7 @@ __init void intel_pmu_lbr_init_skl(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
- x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
/*
* SW branch filter usage:
@@ -1698,6 +1697,38 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
}
+void intel_pmu_lbr_init(void)
+{
+ switch (x86_pmu.intel_cap.lbr_format) {
+ case LBR_FORMAT_EIP_FLAGS2:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_EIP_FLAGS:
+ x86_pmu.lbr_from_flags = 1;
+ break;
+
+ case LBR_FORMAT_INFO:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_INFO2:
+ x86_pmu.lbr_has_info = 1;
+ break;
+
+ case LBR_FORMAT_TIME:
+ x86_pmu.lbr_from_flags = 1;
+ x86_pmu.lbr_to_cycles = 1;
+ break;
+ }
+
+ if (x86_pmu.lbr_has_info) {
+ /*
+ * Only used in combination with baseline pebs.
+ */
+ static_branch_enable(&x86_lbr_mispred);
+ static_branch_enable(&x86_lbr_cycles);
+ }
+}
+
/*
* LBR state size is variable based on the max number of registers.
* This calculates the expected state size, which should match
@@ -1718,6 +1749,9 @@ static bool is_arch_lbr_xsave_available(void)
* Check the LBR state with the corresponding software structure.
* Disable LBR XSAVES support if the size doesn't match.
*/
+ if (xfeature_size(XFEATURE_LBR) == 0)
+ return false;
+
if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size()))
return false;
@@ -1726,7 +1760,7 @@ static bool is_arch_lbr_xsave_available(void)
void __init intel_pmu_arch_lbr_init(void)
{
- struct pmu *pmu = x86_get_pmu();
+ struct pmu *pmu = x86_get_pmu(smp_processor_id());
union cpuid28_eax eax;
union cpuid28_ebx ebx;
union cpuid28_ecx ecx;
@@ -1757,6 +1791,12 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
x86_pmu.lbr_nr = lbr_nr;
+ if (x86_pmu.lbr_mispred)
+ static_branch_enable(&x86_lbr_mispred);
+ if (x86_pmu.lbr_timed_lbr)
+ static_branch_enable(&x86_lbr_cycles);
+ if (x86_pmu.lbr_br_type)
+ static_branch_enable(&x86_lbr_type);
arch_lbr_xsave = is_arch_lbr_xsave_available();
if (arch_lbr_xsave) {
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index a4cc66005ce8..7951a5dc73b6 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -24,7 +24,7 @@ struct p4_event_bind {
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int escr_emask; /* valid ESCR EventMask bits */
unsigned int shared; /* event is shared across threads */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
};
struct p4_pebs_bind {
@@ -45,7 +45,7 @@ struct p4_pebs_bind {
* it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
* event configuration to find out which values are to be
* written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * resgisters
+ * registers
*/
static struct p4_pebs_bind p4_pebs_bind_map[] = {
P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
@@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config)
(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
}
-static void p4_pmu_enable_event(struct perf_event *event)
+static void __p4_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int thread = p4_ht_config_thread(hwc->config);
@@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event)
(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
}
+static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running);
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ __set_bit(idx, per_cpu(p4_running, smp_processor_id()));
+ __p4_pmu_enable_event(event);
+}
+
static void p4_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -992,7 +1002,7 @@ static void p4_pmu_enable_all(int added)
struct perf_event *event = cpuc->events[idx];
if (!test_bit(idx, cpuc->active_mask))
continue;
- p4_pmu_enable_event(event);
+ __p4_pmu_enable_event(event);
}
}
@@ -1012,7 +1022,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!test_bit(idx, cpuc->active_mask)) {
/* catch in-flight IRQs */
- if (__test_and_clear_bit(idx, cpuc->running))
+ if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id())))
handled++;
continue;
}
@@ -1313,7 +1323,7 @@ static __initconst const struct x86_pmu p4_pmu = {
.get_event_constraints = x86_get_event_constraints,
/*
* IF HT disabled we may need to use all
- * ARCH_P4_MAX_CCCR counters simulaneously
+ * ARCH_P4_MAX_CCCR counters simultaneously
* though leave it restricted at moment assuming
* HT is on
*/
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e94af4a54d0d..82ef87e9a897 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -13,6 +13,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/device.h>
@@ -57,12 +59,14 @@ static struct pt_cap_desc {
PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
+ PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)),
+ PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)),
PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)),
PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
- PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
@@ -108,6 +112,8 @@ PMU_FORMAT_ATTR(tsc, "config:10" );
PMU_FORMAT_ATTR(noretcomp, "config:11" );
PMU_FORMAT_ATTR(ptw, "config:12" );
PMU_FORMAT_ATTR(branch, "config:13" );
+PMU_FORMAT_ATTR(event, "config:31" );
+PMU_FORMAT_ATTR(notnt, "config:55" );
PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
PMU_FORMAT_ATTR(psb_period, "config:24-27" );
@@ -116,6 +122,8 @@ static struct attribute *pt_formats_attr[] = {
&format_attr_pt.attr,
&format_attr_cyc.attr,
&format_attr_pwr_evt.attr,
+ &format_attr_event.attr,
+ &format_attr_notnt.attr,
&format_attr_fup_on_ptw.attr,
&format_attr_mtc.attr,
&format_attr_tsc.attr,
@@ -296,6 +304,8 @@ fail:
RTIT_CTL_CYC_PSB | \
RTIT_CTL_MTC | \
RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_EVENT_EN | \
+ RTIT_CTL_NOTNT | \
RTIT_CTL_FUP_ON_PTW | \
RTIT_CTL_PTW_EN)
@@ -350,6 +360,14 @@ static bool pt_event_valid(struct perf_event *event)
!intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
return false;
+ if (config & RTIT_CTL_EVENT_EN &&
+ !intel_pt_validate_hw_cap(PT_CAP_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_NOTNT &&
+ !intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
+ return false;
+
if (config & RTIT_CTL_PTW) {
if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
return false;
@@ -362,7 +380,7 @@ static bool pt_event_valid(struct perf_event *event)
/*
* Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
- * clears the assomption that BranchEn must always be enabled,
+ * clears the assumption that BranchEn must always be enabled,
* as was the case with the first implementation of PT.
* If this bit is not set, the legacy behavior is preserved
* for compatibility with the older userspace.
@@ -472,7 +490,7 @@ static u64 pt_config_filters(struct perf_event *event)
pt->filters.filter[range].msr_b = filter->msr_b;
}
- rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
+ rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
}
return rtit_ctl;
@@ -897,8 +915,9 @@ static void pt_handle_status(struct pt *pt)
* means we are already losing data; need to let the decoder
* know.
*/
- if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
- buf->output_off == pt_buffer_region_size(buf)) {
+ if (!buf->single &&
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == pt_buffer_region_size(buf))) {
perf_aux_output_flag(&pt->handle,
PERF_AUX_FLAG_TRUNCATED);
advance++;
@@ -1347,10 +1366,26 @@ static void pt_addr_filters_fini(struct perf_event *event)
event->hw.addr_filters = NULL;
}
-static inline bool valid_kernel_ip(unsigned long ip)
+#ifdef CONFIG_X86_64
+/* Clamp to a canonical address greater-than-or-equal-to the address given */
+static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
+{
+ return __is_canonical_address(vaddr, vaddr_bits) ?
+ vaddr :
+ -BIT_ULL(vaddr_bits - 1);
+}
+
+/* Clamp to a canonical address less-than-or-equal-to the address given */
+static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
{
- return virt_addr_valid(ip) && kernel_ip(ip);
+ return __is_canonical_address(vaddr, vaddr_bits) ?
+ vaddr :
+ BIT_ULL(vaddr_bits - 1) - 1;
}
+#else
+#define clamp_to_ge_canonical_addr(x, y) (x)
+#define clamp_to_le_canonical_addr(x, y) (x)
+#endif
static int pt_event_addr_filters_validate(struct list_head *filters)
{
@@ -1366,14 +1401,6 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;
- if (!filter->path.dentry) {
- if (!valid_kernel_ip(filter->offset))
- return -EINVAL;
-
- if (!valid_kernel_ip(filter->offset + filter->size))
- return -EINVAL;
- }
-
if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
return -EOPNOTSUPP;
}
@@ -1397,9 +1424,26 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
if (filter->path.dentry && !fr[range].start) {
msr_a = msr_b = 0;
} else {
- /* apply the offset */
- msr_a = fr[range].start;
- msr_b = msr_a + fr[range].size - 1;
+ unsigned long n = fr[range].size - 1;
+ unsigned long a = fr[range].start;
+ unsigned long b;
+
+ if (a > ULONG_MAX - n)
+ b = ULONG_MAX;
+ else
+ b = a + n;
+ /*
+ * Apply the offset. 64-bit addresses written to the
+ * MSRs must be canonical, but the range can encompass
+ * non-canonical addresses. Since software cannot
+ * execute at non-canonical addresses, adjusting to
+ * canonical addresses does not affect the result of the
+ * address filter.
+ */
+ msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
+ msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
+ if (msr_b < msr_a)
+ msr_a = msr_b = 0;
}
filters->filter[range].msr_a = msr_a;
@@ -1708,7 +1752,7 @@ static __init int pt_init(void)
if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
return -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
u64 ctl;
@@ -1716,7 +1760,7 @@ static __init int pt_init(void)
if (!ret && (ctl & RTIT_CTL_TRACEEN))
prior_warn++;
}
- put_online_cpus();
+ cpus_read_unlock();
if (prior_warn) {
x86_add_exclusive(x86_lbr_exclusive_pt);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180d5a87..db6c31bca809 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -4,8 +4,13 @@
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "uncore.h"
+#include "uncore_discovery.h"
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
+static bool uncore_no_discover;
+module_param(uncore_no_discover, bool, 0);
+MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism "
+ "(default: enable the discovery mechanism).");
+struct intel_uncore_type *empty_uncore[] = { NULL, };
struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
struct intel_uncore_type **uncore_mmio_uncores = empty_uncore;
@@ -48,6 +53,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus)
return die_id;
}
+int uncore_die_to_segment(int die)
+{
+ struct pci_bus *bus = NULL;
+
+ /* Find first pci bus which attributes to specified die. */
+ while ((bus = pci_find_next_bus(bus)) &&
+ (die != uncore_pcibus_to_dieid(bus)))
+ ;
+
+ return bus ? pci_domain_nr(bus) : -EINVAL;
+}
+
static void uncore_free_pcibus_map(void)
{
struct pci2phy_map *map, *tmp;
@@ -784,8 +801,6 @@ static void uncore_pmu_enable(struct pmu *pmu)
struct intel_uncore_box *box;
uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
- if (!uncore_pmu)
- return;
box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
if (!box)
@@ -801,8 +816,6 @@ static void uncore_pmu_disable(struct pmu *pmu)
struct intel_uncore_box *box;
uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
- if (!uncore_pmu)
- return;
box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
if (!box)
@@ -829,6 +842,45 @@ static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
+void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
+{
+ struct intel_uncore_type *type = pmu->type;
+
+ if (type->num_boxes == 1)
+ sprintf(pmu_name, "uncore_type_%u", type->type_id);
+ else {
+ sprintf(pmu_name, "uncore_type_%u_%d",
+ type->type_id, type->box_ids[pmu->pmu_idx]);
+ }
+}
+
+static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
+{
+ struct intel_uncore_type *type = pmu->type;
+
+ /*
+ * No uncore block name in discovery table.
+ * Use uncore_type_&typeid_&boxid as name.
+ */
+ if (!type->name) {
+ uncore_get_alias_name(pmu->name, pmu);
+ return;
+ }
+
+ if (type->num_boxes == 1) {
+ if (strlen(type->name) > 0)
+ sprintf(pmu->name, "uncore_%s", type->name);
+ else
+ sprintf(pmu->name, "uncore");
+ } else {
+ /*
+ * Use the box ID from the discovery table if applicable.
+ */
+ sprintf(pmu->name, "uncore_%s_%d", type->name,
+ type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx);
+ }
+}
+
static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -855,15 +907,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
pmu->pmu.attr_update = pmu->type->attr_update;
}
- if (pmu->type->num_boxes == 1) {
- if (strlen(pmu->type->name) > 0)
- sprintf(pmu->name, "uncore_%s", pmu->type->name);
- else
- sprintf(pmu->name, "uncore");
- } else {
- sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
- pmu->pmu_idx);
- }
+ uncore_get_pmu_name(pmu);
ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
if (!ret)
@@ -904,6 +948,10 @@ static void uncore_type_exit(struct intel_uncore_type *type)
kfree(type->pmus);
type->pmus = NULL;
}
+ if (type->box_ids) {
+ kfree(type->box_ids);
+ type->box_ids = NULL;
+ }
kfree(type->events_group);
type->events_group = NULL;
}
@@ -1003,10 +1051,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
return 0;
}
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_type *type;
+ u64 box_ctl;
+ int i, die;
+
+ for (; *types; types++) {
+ type = *types;
+ for (die = 0; die < __uncore_max_dies; die++) {
+ for (i = 0; i < type->num_boxes; i++) {
+ if (!type->box_ctls[die])
+ continue;
+ box_ctl = type->box_ctls[die] + type->pci_offsets[i];
+ if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) &&
+ pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) &&
+ pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl))
+ return &type->pmus[i];
+ }
+ }
+ }
+
+ return NULL;
+}
+
/*
* Find the PMU of a PCI device.
* @pdev: The PCI device.
* @ids: The ID table of the available PCI devices with a PMU.
+ * If NULL, search the whole uncore_pci_uncores.
*/
static struct intel_uncore_pmu *
uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
@@ -1016,6 +1091,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
kernel_ulong_t data;
unsigned int devfn;
+ if (!ids)
+ return uncore_pci_find_dev_pmu_from_types(pdev);
+
while (ids && ids->vendor) {
if ((ids->vendor == pdev->vendor) &&
(ids->device == pdev->device)) {
@@ -1109,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* PCI slot and func to indicate the uncore box.
*/
if (id->driver_data & ~0xffff) {
- struct pci_driver *pci_drv = pdev->driver;
+ struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver);
pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
if (pmu == NULL)
@@ -1174,7 +1252,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
}
static int uncore_bus_notify(struct notifier_block *nb,
- unsigned long action, void *data)
+ unsigned long action, void *data,
+ const struct pci_device_id *ids)
{
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
@@ -1185,7 +1264,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (action != BUS_NOTIFY_DEL_DEVICE)
return NOTIFY_DONE;
- pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table);
+ pmu = uncore_pci_find_dev_pmu(pdev, ids);
if (!pmu)
return NOTIFY_DONE;
@@ -1197,8 +1276,15 @@ static int uncore_bus_notify(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block uncore_notifier = {
- .notifier_call = uncore_bus_notify,
+static int uncore_pci_sub_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data,
+ uncore_pci_sub_driver->id_table);
+}
+
+static struct notifier_block uncore_pci_sub_notifier = {
+ .notifier_call = uncore_pci_sub_bus_notify,
};
static void uncore_pci_sub_driver_init(void)
@@ -1239,13 +1325,55 @@ static void uncore_pci_sub_driver_init(void)
ids++;
}
- if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier))
+ if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier))
notify = false;
if (!notify)
uncore_pci_sub_driver = NULL;
}
+static int uncore_pci_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data, NULL);
+}
+
+static struct notifier_block uncore_pci_notifier = {
+ .notifier_call = uncore_pci_bus_notify,
+};
+
+
+static void uncore_pci_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct pci_dev *pdev;
+ u64 box_ctl;
+ int i, die;
+
+ for (; *types; types++) {
+ type = *types;
+ for (die = 0; die < __uncore_max_dies; die++) {
+ for (i = 0; i < type->num_boxes; i++) {
+ if (!type->box_ctls[die])
+ continue;
+ box_ctl = type->box_ctls[die] + type->pci_offsets[i];
+ pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl),
+ UNCORE_DISCOVERY_PCI_BUS(box_ctl),
+ UNCORE_DISCOVERY_PCI_DEVFN(box_ctl));
+ if (!pdev)
+ continue;
+ pmu = &type->pmus[i];
+
+ uncore_pci_pmu_register(pdev, type, pmu, die);
+ }
+ }
+ }
+
+ bus_register_notifier(&pci_bus_type, &uncore_pci_notifier);
+}
+
static int __init uncore_pci_init(void)
{
size_t size;
@@ -1262,12 +1390,15 @@ static int __init uncore_pci_init(void)
if (ret)
goto errtype;
- uncore_pci_driver->probe = uncore_pci_probe;
- uncore_pci_driver->remove = uncore_pci_remove;
+ if (uncore_pci_driver) {
+ uncore_pci_driver->probe = uncore_pci_probe;
+ uncore_pci_driver->remove = uncore_pci_remove;
- ret = pci_register_driver(uncore_pci_driver);
- if (ret)
- goto errtype;
+ ret = pci_register_driver(uncore_pci_driver);
+ if (ret)
+ goto errtype;
+ } else
+ uncore_pci_pmus_register();
if (uncore_pci_sub_driver)
uncore_pci_sub_driver_init();
@@ -1290,8 +1421,11 @@ static void uncore_pci_exit(void)
if (pcidrv_registered) {
pcidrv_registered = false;
if (uncore_pci_sub_driver)
- bus_unregister_notifier(&pci_bus_type, &uncore_notifier);
- pci_unregister_driver(uncore_pci_driver);
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier);
+ if (uncore_pci_driver)
+ pci_unregister_driver(uncore_pci_driver);
+ else
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier);
uncore_types_exit(uncore_pci_uncores);
kfree(uncore_extra_pci_dev);
uncore_free_pcibus_map();
@@ -1540,6 +1674,7 @@ struct intel_uncore_init_fun {
void (*cpu_init)(void);
int (*pci_init)(void);
void (*mmio_init)(void);
+ bool use_discovery;
};
static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
@@ -1625,6 +1760,11 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
};
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+ .cpu_init = adl_uncore_cpu_init,
+ .mmio_init = adl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1637,6 +1777,19 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = {
.mmio_init = snr_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
+ .cpu_init = spr_uncore_cpu_init,
+ .pci_init = spr_uncore_pci_init,
+ .mmio_init = spr_uncore_mmio_init,
+ .use_discovery = true,
+};
+
+static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
+ .cpu_init = intel_uncore_generic_uncore_cpu_init,
+ .pci_init = intel_uncore_generic_uncore_pci_init,
+ .mmio_init = intel_uncore_generic_uncore_mmio_init,
+};
+
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init),
@@ -1673,6 +1826,12 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
};
@@ -1684,17 +1843,26 @@ static int __init intel_uncore_init(void)
struct intel_uncore_init_fun *uncore_init;
int pret = 0, cret = 0, mret = 0, ret;
- id = x86_match_cpu(intel_uncore_match);
- if (!id)
- return -ENODEV;
-
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
__uncore_max_dies =
topology_max_packages() * topology_max_die_per_package();
- uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ id = x86_match_cpu(intel_uncore_match);
+ if (!id) {
+ if (!uncore_no_discover && intel_uncore_has_discovery_tables())
+ uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init;
+ else
+ return -ENODEV;
+ } else {
+ uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ if (uncore_no_discover && uncore_init->use_discovery)
+ return -ENODEV;
+ if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables())
+ return -ENODEV;
+ }
+
if (uncore_init->pci_init) {
pret = uncore_init->pci_init();
if (!pret)
@@ -1711,8 +1879,10 @@ static int __init intel_uncore_init(void)
mret = uncore_mmio_init();
}
- if (cret && pret && mret)
- return -ENODEV;
+ if (cret && pret && mret) {
+ ret = -ENODEV;
+ goto free_discovery;
+ }
/* Install hotplug callbacks to setup the targets for each package */
ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
@@ -1727,6 +1897,8 @@ err:
uncore_types_exit(uncore_msr_uncores);
uncore_types_exit(uncore_mmio_uncores);
uncore_pci_exit();
+free_discovery:
+ intel_uncore_clear_discovery_tables();
return ret;
}
module_init(intel_uncore_init);
@@ -1737,5 +1909,6 @@ static void __exit intel_uncore_exit(void)
uncore_types_exit(uncore_msr_uncores);
uncore_types_exit(uncore_mmio_uncores);
uncore_pci_exit();
+ intel_uncore_clear_discovery_tables();
}
module_exit(intel_uncore_exit);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index a3c6e1643ad2..2adeaf4de4df 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -42,6 +42,7 @@ struct intel_uncore_pmu;
struct intel_uncore_box;
struct uncore_event_desc;
struct freerunning_counters;
+struct intel_uncore_topology;
struct intel_uncore_type {
const char *name;
@@ -50,6 +51,7 @@ struct intel_uncore_type {
int perf_ctr_bits;
int fixed_ctr_bits;
int num_freerunning_types;
+ int type_id;
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
@@ -57,6 +59,7 @@ struct intel_uncore_type {
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
+ u64 *box_ctls; /* Unit ctrl addr of the first box of each die */
union {
unsigned msr_offset;
unsigned mmio_offset;
@@ -65,7 +68,12 @@ struct intel_uncore_type {
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
- unsigned *msr_offsets;
+ union {
+ unsigned *msr_offsets;
+ unsigned *pci_offsets;
+ unsigned *mmio_offsets;
+ };
+ unsigned *box_ids;
struct event_constraint unconstrainted;
struct event_constraint *constraints;
struct intel_uncore_pmu *pmus;
@@ -80,10 +88,11 @@ struct intel_uncore_type {
* to identify which platform component each PMON block of that type is
* supposed to monitor.
*/
- u64 *topology;
+ struct intel_uncore_topology *topology;
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
+ int (*get_topology)(struct intel_uncore_type *type);
int (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
};
@@ -169,6 +178,11 @@ struct freerunning_counters {
unsigned *box_offsets;
};
+struct intel_uncore_topology {
+ u64 configuration;
+ int segment;
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
@@ -177,6 +191,7 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
int uncore_pcibus_to_dieid(struct pci_bus *bus);
+int uncore_die_to_segment(int die);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
@@ -546,7 +561,9 @@ struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu);
+extern struct intel_uncore_type *empty_uncore[];
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct intel_uncore_type **uncore_mmio_uncores;
@@ -568,8 +585,10 @@ void nhm_uncore_cpu_init(void);
void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
+void adl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
@@ -591,6 +610,9 @@ void snr_uncore_mmio_init(void);
int icx_uncore_pci_init(void);
void icx_uncore_cpu_init(void);
void icx_uncore_mmio_init(void);
+int spr_uncore_pci_init(void);
+void spr_uncore_cpu_init(void);
+void spr_uncore_mmio_init(void);
/* uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
new file mode 100644
index 000000000000..5fd72d4b8bbb
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -0,0 +1,630 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Support Intel uncore PerfMon discovery mechanism.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+static struct rb_root discovery_tables = RB_ROOT;
+static int num_discovered_types[UNCORE_ACCESS_MAX];
+
+static bool has_generic_discovery_table(void)
+{
+ struct pci_dev *dev;
+ int dvsec;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL);
+ if (!dev)
+ return false;
+
+ /* A discovery table device has the unique capability ID. */
+ dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY);
+ pci_dev_put(dev);
+ if (dvsec)
+ return true;
+
+ return false;
+}
+
+static int logical_die_id;
+
+static int get_device_die_id(struct pci_dev *dev)
+{
+ int cpu, node = pcibus_to_node(dev->bus);
+
+ /*
+ * If the NUMA info is not available, assume that the logical die id is
+ * continuous in the order in which the discovery table devices are
+ * detected.
+ */
+ if (node < 0)
+ return logical_die_id++;
+
+ for_each_cpu(cpu, cpumask_of_node(node)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node)
+ return c->logical_die_id;
+ }
+
+ /*
+ * All CPUs of a node may be offlined. For this case,
+ * the PCI and MMIO type of uncore blocks which are
+ * enumerated by the device will be unavailable.
+ */
+ return -1;
+}
+
+#define __node_2_type(cur) \
+ rb_entry((cur), struct intel_uncore_discovery_type, node)
+
+static inline int __type_cmp(const void *key, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_type *type_b = __node_2_type(b);
+ const u16 *type_id = key;
+
+ if (type_b->type > *type_id)
+ return -1;
+ else if (type_b->type < *type_id)
+ return 1;
+
+ return 0;
+}
+
+static inline struct intel_uncore_discovery_type *
+search_uncore_discovery_type(u16 type_id)
+{
+ struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp);
+
+ return (node) ? __node_2_type(node) : NULL;
+}
+
+static inline bool __type_less(struct rb_node *a, const struct rb_node *b)
+{
+ return (__node_2_type(a)->type < __node_2_type(b)->type);
+}
+
+static struct intel_uncore_discovery_type *
+add_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ if (unit->access_type >= UNCORE_ACCESS_MAX) {
+ pr_warn("Unsupported access type %d\n", unit->access_type);
+ return NULL;
+ }
+
+ type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL);
+ if (!type)
+ return NULL;
+
+ type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
+ if (!type->box_ctrl_die)
+ goto free_type;
+
+ type->access_type = unit->access_type;
+ num_discovered_types[type->access_type]++;
+ type->type = unit->box_type;
+
+ rb_add(&type->node, &discovery_tables, __type_less);
+
+ return type;
+
+free_type:
+ kfree(type);
+
+ return NULL;
+
+}
+
+static struct intel_uncore_discovery_type *
+get_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ type = search_uncore_discovery_type(unit->box_type);
+ if (type)
+ return type;
+
+ return add_uncore_discovery_type(unit);
+}
+
+static void
+uncore_insert_box_info(struct uncore_unit_discovery *unit,
+ int die, bool parsed)
+{
+ struct intel_uncore_discovery_type *type;
+ unsigned int *box_offset, *ids;
+ int i;
+
+ if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset))
+ return;
+
+ if (parsed) {
+ type = search_uncore_discovery_type(unit->box_type);
+ if (WARN_ON_ONCE(!type))
+ return;
+ /* Store the first box of each die */
+ if (!type->box_ctrl_die[die])
+ type->box_ctrl_die[die] = unit->ctl;
+ return;
+ }
+
+ type = get_uncore_discovery_type(unit);
+ if (!type)
+ return;
+
+ box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ if (!box_offset)
+ return;
+
+ ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ if (!ids)
+ goto free_box_offset;
+
+ /* Store generic information for the first box */
+ if (!type->num_boxes) {
+ type->box_ctrl = unit->ctl;
+ type->box_ctrl_die[die] = unit->ctl;
+ type->num_counters = unit->num_regs;
+ type->counter_width = unit->bit_width;
+ type->ctl_offset = unit->ctl_offset;
+ type->ctr_offset = unit->ctr_offset;
+ *ids = unit->box_id;
+ goto end;
+ }
+
+ for (i = 0; i < type->num_boxes; i++) {
+ ids[i] = type->ids[i];
+ box_offset[i] = type->box_offset[i];
+
+ if (WARN_ON_ONCE(unit->box_id == ids[i]))
+ goto free_ids;
+ }
+ ids[i] = unit->box_id;
+ box_offset[i] = unit->ctl - type->box_ctrl;
+ kfree(type->ids);
+ kfree(type->box_offset);
+end:
+ type->ids = ids;
+ type->box_offset = box_offset;
+ type->num_boxes++;
+ return;
+
+free_ids:
+ kfree(ids);
+
+free_box_offset:
+ kfree(box_offset);
+
+}
+
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed)
+{
+ struct uncore_global_discovery global;
+ struct uncore_unit_discovery unit;
+ void __iomem *io_addr;
+ resource_size_t addr;
+ unsigned long size;
+ u32 val;
+ int i;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ u32 val2;
+
+ pci_read_config_dword(dev, bar_offset + 4, &val2);
+ addr |= ((resource_size_t)val2) << 32;
+ }
+#endif
+ size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Read Global Discovery State */
+ memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery));
+ if (uncore_discovery_invalid_unit(global)) {
+ pr_info("Invalid Global Discovery State: 0x%llx 0x%llx 0x%llx\n",
+ global.table1, global.ctl, global.table3);
+ iounmap(io_addr);
+ return -EINVAL;
+ }
+ iounmap(io_addr);
+
+ size = (1 + global.max_units) * global.stride * 8;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Parsing Unit Discovery State */
+ for (i = 0; i < global.max_units; i++) {
+ memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8),
+ sizeof(struct uncore_unit_discovery));
+
+ if (uncore_discovery_invalid_unit(unit))
+ continue;
+
+ if (unit.access_type >= UNCORE_ACCESS_MAX)
+ continue;
+
+ uncore_insert_box_info(&unit, die, *parsed);
+ }
+
+ *parsed = true;
+ iounmap(io_addr);
+ return 0;
+}
+
+bool intel_uncore_has_discovery_tables(void)
+{
+ u32 device, val, entry_id, bar_offset;
+ int die, dvsec = 0, ret = true;
+ struct pci_dev *dev = NULL;
+ bool parsed = false;
+
+ if (has_generic_discovery_table())
+ device = UNCORE_DISCOVERY_TABLE_DEVICE;
+ else
+ device = PCI_ANY_ID;
+
+ /*
+ * Start a new search and iterates through the list of
+ * the discovery table devices.
+ */
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+ while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) {
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val);
+ entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK;
+ if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON)
+ continue;
+
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val);
+
+ if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) {
+ ret = false;
+ goto err;
+ }
+ bar_offset = UNCORE_DISCOVERY_BIR_BASE +
+ (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP;
+
+ die = get_device_die_id(dev);
+ if (die < 0)
+ continue;
+
+ parse_discovery_table(dev, die, bar_offset, &parsed);
+ }
+ }
+
+ /* None of the discovery tables are available */
+ if (!parsed)
+ ret = false;
+err:
+ pci_dev_put(dev);
+
+ return ret;
+}
+
+void intel_uncore_clear_discovery_tables(void)
+{
+ struct intel_uncore_discovery_type *type, *next;
+
+ rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) {
+ kfree(type->box_ctrl_die);
+ kfree(type);
+ }
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31");
+
+static struct attribute *generic_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh.attr,
+ NULL,
+};
+
+static const struct attribute_group generic_uncore_format_group = {
+ .name = "format",
+ .attrs = generic_uncore_formats_attr,
+};
+
+void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT);
+}
+
+void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), 0);
+}
+
+static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, 0);
+}
+
+static struct intel_uncore_ops generic_uncore_msr_ops = {
+ .init_box = intel_generic_uncore_msr_init_box,
+ .disable_box = intel_generic_uncore_msr_disable_box,
+ .enable_box = intel_generic_uncore_msr_enable_box,
+ .disable_event = intel_generic_uncore_msr_disable_event,
+ .enable_event = intel_generic_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT);
+}
+
+void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, 0);
+}
+
+u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops generic_uncore_pci_ops = {
+ .init_box = intel_generic_uncore_pci_init_box,
+ .disable_box = intel_generic_uncore_pci_disable_box,
+ .enable_box = intel_generic_uncore_pci_enable_box,
+ .disable_event = intel_generic_uncore_pci_disable_event,
+ .enable_event = intel_generic_uncore_pci_enable_event,
+ .read_counter = intel_generic_uncore_pci_read_counter,
+};
+
+#define UNCORE_GENERIC_MMIO_SIZE 0x4000
+
+static u64 generic_uncore_mmio_box_ctl(struct intel_uncore_box *box)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+
+ if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets)
+ return 0;
+
+ return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx];
+}
+
+void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
+{
+ u64 box_ctl = generic_uncore_mmio_box_ctl(box);
+ struct intel_uncore_type *type = box->pmu->type;
+ resource_size_t addr;
+
+ if (!box_ctl) {
+ pr_warn("Uncore type %d box %d: Invalid box control address.\n",
+ type->type_id, type->box_ids[box->pmu->pmu_idx]);
+ return;
+ }
+
+ addr = box_ctl;
+ box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+ if (!box->io_addr) {
+ pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
+ type->type_id, type->box_ids[box->pmu->pmu_idx],
+ (unsigned long long)addr);
+ return;
+ }
+
+ writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops generic_uncore_mmio_ops = {
+ .init_box = intel_generic_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static bool uncore_update_uncore_type(enum uncore_access_type type_id,
+ struct intel_uncore_type *uncore,
+ struct intel_uncore_discovery_type *type)
+{
+ uncore->type_id = type->type;
+ uncore->num_boxes = type->num_boxes;
+ uncore->num_counters = type->num_counters;
+ uncore->perf_ctr_bits = type->counter_width;
+ uncore->box_ids = type->ids;
+
+ switch (type_id) {
+ case UNCORE_ACCESS_MSR:
+ uncore->ops = &generic_uncore_msr_ops;
+ uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset;
+ uncore->box_ctl = (unsigned int)type->box_ctrl;
+ uncore->msr_offsets = type->box_offset;
+ break;
+ case UNCORE_ACCESS_PCI:
+ uncore->ops = &generic_uncore_pci_ops;
+ uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset;
+ uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset;
+ uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl);
+ uncore->box_ctls = type->box_ctrl_die;
+ uncore->pci_offsets = type->box_offset;
+ break;
+ case UNCORE_ACCESS_MMIO:
+ uncore->ops = &generic_uncore_mmio_ops;
+ uncore->perf_ctr = (unsigned int)type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->ctl_offset;
+ uncore->box_ctl = (unsigned int)type->box_ctrl;
+ uncore->box_ctls = type->box_ctrl_die;
+ uncore->mmio_offsets = type->box_offset;
+ uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+struct intel_uncore_type **
+intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra)
+{
+ struct intel_uncore_discovery_type *type;
+ struct intel_uncore_type **uncores;
+ struct intel_uncore_type *uncore;
+ struct rb_node *node;
+ int i = 0;
+
+ uncores = kcalloc(num_discovered_types[type_id] + num_extra + 1,
+ sizeof(struct intel_uncore_type *), GFP_KERNEL);
+ if (!uncores)
+ return empty_uncore;
+
+ for (node = rb_first(&discovery_tables); node; node = rb_next(node)) {
+ type = rb_entry(node, struct intel_uncore_discovery_type, node);
+ if (type->access_type != type_id)
+ continue;
+
+ uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL);
+ if (!uncore)
+ break;
+
+ uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK;
+ uncore->format_group = &generic_uncore_format_group;
+
+ if (!uncore_update_uncore_type(type_id, uncore, type)) {
+ kfree(uncore);
+ continue;
+ }
+ uncores[i++] = uncore;
+ }
+
+ return uncores;
+}
+
+void intel_uncore_generic_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR, 0);
+}
+
+int intel_uncore_generic_uncore_pci_init(void)
+{
+ uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI, 0);
+
+ return 0;
+}
+
+void intel_uncore_generic_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO, 0);
+}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
new file mode 100644
index 000000000000..f4439357779a
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* Generic device ID of a discovery table device */
+#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
+/* Capability ID for a discovery table device */
+#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23
+/* First DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8
+/* Mask of the supported discovery entry type */
+#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff
+/* PMON discovery entry type ID */
+#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1
+/* Second DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc
+/* Mask of the discovery table BAR offset */
+#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7
+/* Discovery table BAR base offset */
+#define UNCORE_DISCOVERY_BIR_BASE 0x10
+/* Discovery table BAR step */
+#define UNCORE_DISCOVERY_BIR_STEP 0x4
+/* Global discovery table size */
+#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20
+
+#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7)
+#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff)
+#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff)
+
+
+#define uncore_discovery_invalid_unit(unit) \
+ (!unit.table1 || !unit.ctl || \
+ unit.table1 == -1ULL || unit.ctl == -1ULL || \
+ unit.table3 == -1ULL)
+
+#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00
+#define GENERIC_PMON_CTL_EDGE_DET (1 << 18)
+#define GENERIC_PMON_CTL_INVERT (1 << 23)
+#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000
+#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \
+ GENERIC_PMON_CTL_UMASK_MASK | \
+ GENERIC_PMON_CTL_EDGE_DET | \
+ GENERIC_PMON_CTL_INVERT | \
+ GENERIC_PMON_CTL_TRESH_MASK)
+
+#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0)
+#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8)
+#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9)
+#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \
+ GENERIC_PMON_BOX_CTL_RST_CTRS)
+
+enum uncore_access_type {
+ UNCORE_ACCESS_MSR = 0,
+ UNCORE_ACCESS_MMIO,
+ UNCORE_ACCESS_PCI,
+
+ UNCORE_ACCESS_MAX,
+};
+
+struct uncore_global_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 type : 8,
+ stride : 8,
+ max_units : 10,
+ __reserved_1 : 36,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Global Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 status_offset : 8,
+ num_status : 16,
+ __reserved_2 : 40;
+ };
+ };
+};
+
+struct uncore_unit_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 num_regs : 8,
+ ctl_offset : 8,
+ bit_width : 8,
+ ctr_offset : 8,
+ status_offset : 8,
+ __reserved_1 : 22,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Unit Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 box_type : 16,
+ box_id : 16,
+ __reserved_2 : 32;
+ };
+ };
+};
+
+struct intel_uncore_discovery_type {
+ struct rb_node node;
+ enum uncore_access_type access_type;
+ u64 box_ctrl; /* Unit ctrl addr of the first box */
+ u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */
+ u16 type; /* Type ID of the uncore block */
+ u8 num_counters;
+ u8 counter_width;
+ u8 ctl_offset; /* Counter Control 0 offset */
+ u8 ctr_offset; /* Counter 0 offset */
+ u16 num_boxes; /* number of boxes for the uncore block */
+ unsigned int *ids; /* Box IDs */
+ unsigned int *box_offset; /* Box offset */
+};
+
+bool intel_uncore_has_discovery_tables(void);
+void intel_uncore_clear_discovery_tables(void);
+void intel_uncore_generic_uncore_cpu_init(void);
+int intel_uncore_generic_uncore_pci_init(void);
+void intel_uncore_generic_uncore_mmio_init(void);
+
+void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box);
+
+void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+struct intel_uncore_type **
+intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 51271288499e..ce440011cc4e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */
#include "uncore.h"
+#include "uncore_discovery.h"
/* Uncore IMC PCI IDs */
#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
@@ -62,6 +63,59 @@
#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36
#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43
#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641
+#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601
+#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602
+#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609
+#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a
+#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621
+#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623
+#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629
+#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637
+#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b
+#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648
+#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649
+#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
+#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
+#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
+#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614
+#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617
+#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618
+#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B
+#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C
+#define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700
+#define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702
+#define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701
+#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703
+#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704
+#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705
+#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707
+#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708
+#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a
+#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b
+#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715
+#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716
+#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717
+#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718
+#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719
+#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A
+#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B
+#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C
+#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728
+#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729
+#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A
+
+
+#define IMC_UNCORE_DEV(a) \
+{ \
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \
+}
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -131,12 +185,34 @@
#define ICL_UNC_ARB_PER_CTR 0x3b1
#define ICL_UNC_ARB_PERFEVTSEL 0x3b3
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL 0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL 0x2fde
+#define ADL_UNC_FIXED_CTR 0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000
+#define ADL_UNC_CTL_THRESHOLD 0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0 0x2FD2
+#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
@@ -422,6 +498,106 @@ void tgl_uncore_cpu_init(void)
skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
}
+static void adl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void adl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops adl_uncore_msr_ops = {
+ .init_box = adl_uncore_msr_init_box,
+ .enable_box = adl_uncore_msr_enable_box,
+ .disable_box = adl_uncore_msr_disable_box,
+ .exit_box = adl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct attribute *adl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = ADL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = ICL_UNC_CBO_MSR_OFFSET,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_ARB_PER_CTR0,
+ .event_ctl = ADL_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = ADL_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_clockbox = {
+ .name = "clock",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = ADL_UNC_FIXED_CTR,
+ .fixed_ctl = ADL_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &adl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *adl_msr_uncores[] = {
+ &adl_uncore_cbox,
+ &adl_uncore_arb,
+ &adl_uncore_clockbox,
+ NULL,
+};
+
+void adl_uncore_cpu_init(void)
+{
+ adl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = adl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -706,242 +882,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = {
};
static const struct pci_device_id snb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(SNB),
{ /* end: all zeroes */ },
};
static const struct pci_device_id ivb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(IVB),
+ IMC_UNCORE_DEV(IVB_E3),
{ /* end: all zeroes */ },
};
static const struct pci_device_id hsw_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(HSW),
+ IMC_UNCORE_DEV(HSW_U),
{ /* end: all zeroes */ },
};
static const struct pci_device_id bdw_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(BDW),
{ /* end: all zeroes */ },
};
static const struct pci_device_id skl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(SKL_Y),
+ IMC_UNCORE_DEV(SKL_U),
+ IMC_UNCORE_DEV(SKL_HD),
+ IMC_UNCORE_DEV(SKL_HQ),
+ IMC_UNCORE_DEV(SKL_SD),
+ IMC_UNCORE_DEV(SKL_SQ),
+ IMC_UNCORE_DEV(SKL_E3),
+ IMC_UNCORE_DEV(KBL_Y),
+ IMC_UNCORE_DEV(KBL_U),
+ IMC_UNCORE_DEV(KBL_UQ),
+ IMC_UNCORE_DEV(KBL_SD),
+ IMC_UNCORE_DEV(KBL_SQ),
+ IMC_UNCORE_DEV(KBL_HQ),
+ IMC_UNCORE_DEV(KBL_WQ),
+ IMC_UNCORE_DEV(CFL_2U),
+ IMC_UNCORE_DEV(CFL_4U),
+ IMC_UNCORE_DEV(CFL_4H),
+ IMC_UNCORE_DEV(CFL_6H),
+ IMC_UNCORE_DEV(CFL_2S_D),
+ IMC_UNCORE_DEV(CFL_4S_D),
+ IMC_UNCORE_DEV(CFL_6S_D),
+ IMC_UNCORE_DEV(CFL_8S_D),
+ IMC_UNCORE_DEV(CFL_4S_W),
+ IMC_UNCORE_DEV(CFL_6S_W),
+ IMC_UNCORE_DEV(CFL_8S_W),
+ IMC_UNCORE_DEV(CFL_4S_S),
+ IMC_UNCORE_DEV(CFL_6S_S),
+ IMC_UNCORE_DEV(CFL_8S_S),
+ IMC_UNCORE_DEV(AML_YD),
+ IMC_UNCORE_DEV(AML_YQ),
+ IMC_UNCORE_DEV(WHL_UQ),
+ IMC_UNCORE_DEV(WHL_4_UQ),
+ IMC_UNCORE_DEV(WHL_UD),
+ IMC_UNCORE_DEV(CML_H1),
+ IMC_UNCORE_DEV(CML_H2),
+ IMC_UNCORE_DEV(CML_H3),
+ IMC_UNCORE_DEV(CML_U1),
+ IMC_UNCORE_DEV(CML_U2),
+ IMC_UNCORE_DEV(CML_U3),
+ IMC_UNCORE_DEV(CML_S1),
+ IMC_UNCORE_DEV(CML_S2),
+ IMC_UNCORE_DEV(CML_S3),
+ IMC_UNCORE_DEV(CML_S4),
+ IMC_UNCORE_DEV(CML_S5),
{ /* end: all zeroes */ },
};
static const struct pci_device_id icl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(ICL_U),
+ IMC_UNCORE_DEV(ICL_U2),
+ IMC_UNCORE_DEV(RKL_1),
+ IMC_UNCORE_DEV(RKL_2),
{ /* end: all zeroes */ },
};
@@ -1183,26 +1197,57 @@ void nhm_uncore_cpu_init(void)
/* Tiger Lake MMIO uncore support */
static const struct pci_device_id tgl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(TGL_U1),
+ IMC_UNCORE_DEV(TGL_U2),
+ IMC_UNCORE_DEV(TGL_U3),
+ IMC_UNCORE_DEV(TGL_U4),
+ IMC_UNCORE_DEV(TGL_H),
+ IMC_UNCORE_DEV(ADL_1),
+ IMC_UNCORE_DEV(ADL_2),
+ IMC_UNCORE_DEV(ADL_3),
+ IMC_UNCORE_DEV(ADL_4),
+ IMC_UNCORE_DEV(ADL_5),
+ IMC_UNCORE_DEV(ADL_6),
+ IMC_UNCORE_DEV(ADL_7),
+ IMC_UNCORE_DEV(ADL_8),
+ IMC_UNCORE_DEV(ADL_9),
+ IMC_UNCORE_DEV(ADL_10),
+ IMC_UNCORE_DEV(ADL_11),
+ IMC_UNCORE_DEV(ADL_12),
+ IMC_UNCORE_DEV(ADL_13),
+ IMC_UNCORE_DEV(ADL_14),
+ IMC_UNCORE_DEV(ADL_15),
+ IMC_UNCORE_DEV(ADL_16),
+ IMC_UNCORE_DEV(ADL_17),
+ IMC_UNCORE_DEV(ADL_18),
+ IMC_UNCORE_DEV(ADL_19),
+ IMC_UNCORE_DEV(ADL_20),
+ IMC_UNCORE_DEV(ADL_21),
+ IMC_UNCORE_DEV(RPL_1),
+ IMC_UNCORE_DEV(RPL_2),
+ IMC_UNCORE_DEV(RPL_3),
+ IMC_UNCORE_DEV(RPL_4),
+ IMC_UNCORE_DEV(RPL_5),
+ IMC_UNCORE_DEV(RPL_6),
+ IMC_UNCORE_DEV(RPL_7),
+ IMC_UNCORE_DEV(RPL_8),
+ IMC_UNCORE_DEV(RPL_9),
+ IMC_UNCORE_DEV(RPL_10),
+ IMC_UNCORE_DEV(RPL_11),
+ IMC_UNCORE_DEV(RPL_12),
+ IMC_UNCORE_DEV(RPL_13),
+ IMC_UNCORE_DEV(RPL_14),
+ IMC_UNCORE_DEV(RPL_15),
+ IMC_UNCORE_DEV(RPL_16),
+ IMC_UNCORE_DEV(RPL_17),
+ IMC_UNCORE_DEV(RPL_18),
+ IMC_UNCORE_DEV(RPL_19),
+ IMC_UNCORE_DEV(RPL_20),
+ IMC_UNCORE_DEV(RPL_21),
+ IMC_UNCORE_DEV(RPL_22),
+ IMC_UNCORE_DEV(RPL_23),
+ IMC_UNCORE_DEV(RPL_24),
+ IMC_UNCORE_DEV(RPL_25),
{ /* end: all zeroes */ }
};
@@ -1259,7 +1304,8 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
-static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
@@ -1286,11 +1332,17 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
addr |= ((resource_size_t)mch_bar << 32);
#endif
+ addr += base_offset;
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
}
+static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, 0);
+}
+
static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
.init_box = tgl_uncore_imc_freerunning_init_box,
.exit_box = uncore_mmio_exit_box,
@@ -1338,3 +1390,136 @@ void tgl_uncore_mmio_init(void)
}
/* end of Tiger Lake MMIO uncore support */
+
+/* Alder Lake MMIO uncore support */
+#define ADL_UNCORE_IMC_BASE 0xd900
+#define ADL_UNCORE_IMC_MAP_SIZE 0x200
+#define ADL_UNCORE_IMC_CTR 0xe8
+#define ADL_UNCORE_IMC_CTRL 0xd0
+#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0
+#define ADL_UNCORE_IMC_BOX_CTL 0xc4
+#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800
+#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100
+
+#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0)
+#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1)
+#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2)
+#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \
+ ADL_UNCORE_IMC_CTL_RST_CTRS)
+
+static void adl_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE);
+
+ /* The global control in MC1 can control both MCs. */
+ if (box->io_addr && (box->pmu->pmu_idx == 1))
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL);
+}
+
+static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static struct intel_uncore_ops adl_uncore_mmio_ops = {
+ .init_box = adl_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = adl_uncore_mmio_disable_box,
+ .enable_box = adl_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
+#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ ADL_UNC_CTL_CHMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET)
+
+static struct attribute *adl_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_chmask.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_imc_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = ADL_UNCORE_IMC_CTR,
+ .event_ctl = ADL_UNCORE_IMC_CTRL,
+ .event_mask = ADL_UNC_IMC_EVENT_MASK,
+ .box_ctl = ADL_UNCORE_IMC_BOX_CTL,
+ .mmio_offset = 0,
+ .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE,
+ .ops = &adl_uncore_mmio_ops,
+ .format_group = &adl_uncore_imc_format_group,
+};
+
+enum perf_adl_uncore_imc_freerunning_types {
+ ADL_MMIO_UNCORE_IMC_DATA_TOTAL,
+ ADL_MMIO_UNCORE_IMC_DATA_READ,
+ ADL_MMIO_UNCORE_IMC_DATA_WRITE,
+ ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX
+};
+
+static struct freerunning_counters adl_uncore_imc_freerunning[] = {
+ [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 },
+};
+
+static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE);
+}
+
+static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = {
+ .init_box = adl_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type adl_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE,
+ .freerunning = adl_uncore_imc_freerunning,
+ .ops = &adl_uncore_imc_freerunning_ops,
+ .event_descs = tgl_uncore_imc_events,
+ .format_group = &tgl_uncore_imc_format_group,
+};
+
+static struct intel_uncore_type *adl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void adl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = adl_mmio_uncores;
+}
+
+/* end of Alder Lake MMIO uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index b79951d0707c..ed869443efb2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* SandyBridge-EP/IvyTown uncore support */
#include "uncore.h"
+#include "uncore_discovery.h"
/* SNB-EP pci bus to socket mapping */
#define SNBEP_CPUNODEID 0x40
@@ -280,17 +281,17 @@
* | [63] | 00h | VALID - When set, indicates the CPU bus
* numbers have been initialized. (RO)
* |[62:48]| --- | Reserved
- * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned
+ * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned
* CPUBUSNO(5). (RO)
- * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned
+ * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned
* CPUBUSNO(4). (RO)
- * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned
+ * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned
* CPUBUSNO(3). (RO)
- * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned
+ * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned
* CPUBUSNO(2). (RO)
- * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned
+ * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned
* CPUBUSNO(1). (RO)
- * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned
+ * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned
* CPUBUSNO(0). (RO)
*/
#define SKX_MSR_CPU_BUS_NUMBER 0x300
@@ -348,6 +349,13 @@
#define SKX_M2M_PCI_PMON_CTR0 0x200
#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+/* Memory Map registers device ID */
+#define SNR_ICX_MESH2IIO_MMAP_DID 0x9a2
+#define SNR_ICX_SAD_CONTROL_CFG 0x3f4
+
+/* Getting I/O stack id in SAD_COTROL_CFG notation */
+#define SAD_CONTROL_STACK_ID(data) (((data) >> 4) & 0x7)
+
/* SNR Ubox */
#define SNR_U_MSR_PMON_CTR0 0x1f98
#define SNR_U_MSR_PMON_CTL0 0x1f91
@@ -444,9 +452,20 @@
#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
/* ICX IMC */
-#define ICX_NUMBER_IMC_CHN 2
+#define ICX_NUMBER_IMC_CHN 3
#define ICX_IMC_MEM_STRIDE 0x4
+/* SPR */
+#define SPR_RAW_EVENT_MASK_EXT 0xffffff
+
+/* SPR CHA */
+#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
+#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SPR_CHA_PMON_CTL_TID_EN)
+#define SPR_CHA_PMON_BOX_FILTER_TID 0x3ff
+
+#define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
@@ -459,6 +478,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
@@ -1159,7 +1179,6 @@ enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
BDX_PCI_QPI_PORT2_FILTER,
- HSWEP_PCI_PCU_3,
};
static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
@@ -1407,6 +1426,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
die_id = i;
else
die_id = topology_phys_to_logical_pkg(i);
+ if (die_id < 0)
+ die_id = -ENODEV;
map->pbus_to_dieid[bus] = die_id;
break;
}
@@ -1453,14 +1474,14 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_dieid[bus] >= 0)
+ if (map->pbus_to_dieid[bus] != -1)
i = map->pbus_to_dieid[bus];
else
map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_dieid[bus] >= 0)
+ if (map->pbus_to_dieid[bus] != -1)
i = map->pbus_to_dieid[bus];
else
map->pbus_to_dieid[bus] = i;
@@ -2857,22 +2878,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {
NULL,
};
-void hswep_uncore_cpu_init(void)
+#define HSWEP_PCU_DID 0x2fc0
+#define HSWEP_PCU_CAPID4_OFFET 0x94
+#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3)
+
+static bool hswep_has_limit_sbox(unsigned int device)
{
- int pkg = boot_cpu_data.logical_proc_id;
+ struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
+ u32 capid4;
+ if (!dev)
+ return false;
+
+ pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ if (!hswep_get_chop(capid4))
+ return true;
+
+ return false;
+}
+
+void hswep_uncore_cpu_init(void)
+{
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
/* Detect 6-8 core systems with only two SBOXes */
- if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- u32 capid4;
-
- pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
- 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- hswep_uncore_sbox.num_boxes = 2;
- }
+ if (hswep_has_limit_sbox(HSWEP_PCU_DID))
+ hswep_uncore_sbox.num_boxes = 2;
uncore_msr_uncores = hswep_msr_uncores;
}
@@ -3135,11 +3167,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
SNBEP_PCI_QPI_PORT1_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
@@ -3231,27 +3258,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
EVENT_CONSTRAINT_END
};
+#define BDX_PCU_DID 0x6fc0
+
void bdx_uncore_cpu_init(void)
{
- int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id);
-
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
- /* BDX-DE doesn't have SBOX */
- if (boot_cpu_data.x86_model == 86) {
- uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
/* Detect systems with no SBOXes */
- } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- struct pci_dev *pdev;
- u32 capid4;
-
- pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
- pci_read_config_dword(pdev, 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
- }
+ if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
@@ -3472,11 +3490,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
BDX_PCI_QPI_PORT2_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
@@ -3595,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
struct extra_reg *er;
int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
if (er->event != (event->hw.config & er->config_mask))
@@ -3662,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
@@ -3684,32 +3701,35 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
{
- return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+ return pmu->type->topology[die].configuration >>
+ (pmu->pmu_idx * BUS_NUM_STRIDE);
}
static umode_t
-skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
+ int die, int zero_bus_pmu)
{
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
- /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */
- return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode;
+ return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 0. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 0);
}
static ssize_t skx_iio_mapping_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
- struct pci_bus *bus = pci_find_next_bus(NULL);
- struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
- /*
- * Current implementation is for single segment configuration hence it's
- * safe to take the segment value from the first available root bus.
- */
- return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
- skx_iio_stack(uncore_pmu, die));
+ return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
+ skx_iio_stack(pmu, die));
}
static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3746,34 +3766,32 @@ static int die_to_cpu(int die)
static int skx_iio_get_topology(struct intel_uncore_type *type)
{
- int i, ret;
- struct pci_bus *bus = NULL;
-
- /*
- * Verified single-segment environments only; disabled for multiple
- * segment topologies for now except VMD domains.
- * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
- */
- while ((bus = pci_find_next_bus(bus))
- && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff))
- ;
- if (bus)
- return -EPERM;
+ int die, ret = -EPERM;
- type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL);
+ type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
+ GFP_KERNEL);
if (!type->topology)
return -ENOMEM;
- for (i = 0; i < uncore_max_dies(); i++) {
- ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]);
- if (ret) {
- kfree(type->topology);
- type->topology = NULL;
- return ret;
- }
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die),
+ &type->topology[die].configuration);
+ if (ret)
+ break;
+
+ ret = uncore_die_to_segment(die);
+ if (ret < 0)
+ break;
+
+ type->topology[die].segment = ret;
}
- return 0;
+ if (ret < 0) {
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+
+ return ret;
}
static struct attribute_group skx_iio_mapping_group = {
@@ -3785,7 +3803,8 @@ static const struct attribute_group *skx_iio_attr_update[] = {
NULL,
};
-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+static int
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
char buf[64];
int ret;
@@ -3793,8 +3812,8 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
struct attribute **attrs = NULL;
struct dev_ext_attribute *eas = NULL;
- ret = skx_iio_get_topology(type);
- if (ret)
+ ret = type->get_topology(type);
+ if (ret < 0)
goto clear_attr_update;
ret = -ENOMEM;
@@ -3802,11 +3821,11 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
if (!attrs)
- goto err;
+ goto clear_topology;
eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
if (!eas)
- goto err;
+ goto clear_attrs;
for (die = 0; die < uncore_max_dies(); die++) {
sprintf(buf, "die%ld", die);
@@ -3820,35 +3839,48 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
eas[die].var = (void *)die;
attrs[die] = &eas[die].attr.attr;
}
- skx_iio_mapping_group.attrs = attrs;
+ ag->attrs = attrs;
return 0;
err:
for (; die >= 0; die--)
kfree(eas[die].attr.attr.name);
kfree(eas);
+clear_attrs:
kfree(attrs);
+clear_topology:
kfree(type->topology);
clear_attr_update:
type->attr_update = NULL;
return ret;
}
-static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+static void
+pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
- struct attribute **attr = skx_iio_mapping_group.attrs;
+ struct attribute **attr = ag->attrs;
if (!attr)
return;
for (; *attr; attr++)
kfree((*attr)->name);
- kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
- kfree(skx_iio_mapping_group.attrs);
- skx_iio_mapping_group.attrs = NULL;
+ kfree(attr_to_ext_attr(*ag->attrs));
+ kfree(ag->attrs);
+ ag->attrs = NULL;
kfree(type->topology);
}
+static int skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+}
+
+static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
+}
+
static struct intel_uncore_type skx_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -3864,6 +3896,7 @@ static struct intel_uncore_type skx_uncore_iio = {
.ops = &skx_uncore_iio_ops,
.format_group = &skx_uncore_iio_format_group,
.attr_update = skx_iio_attr_update,
+ .get_topology = skx_iio_get_topology,
.set_mapping = skx_iio_set_mapping,
.cleanup_mapping = skx_iio_cleanup_mapping,
};
@@ -4406,6 +4439,103 @@ static const struct attribute_group snr_uncore_iio_format_group = {
.attrs = snr_uncore_iio_formats_attr,
};
+static umode_t
+snr_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 1. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 1);
+}
+
+static struct attribute_group snr_iio_mapping_group = {
+ .is_visible = snr_iio_mapping_visible,
+};
+
+static const struct attribute_group *snr_iio_attr_update[] = {
+ &snr_iio_mapping_group,
+ NULL,
+};
+
+static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_mapping)
+{
+ u32 sad_cfg;
+ int die, stack_id, ret = -EPERM;
+ struct pci_dev *dev = NULL;
+
+ type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
+ GFP_KERNEL);
+ if (!type->topology)
+ return -ENOMEM;
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
+ ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ die = uncore_pcibus_to_dieid(dev->bus);
+ stack_id = SAD_CONTROL_STACK_ID(sad_cfg);
+ if (die < 0 || stack_id >= type->num_boxes) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Convert stack id from SAD_CONTROL to PMON notation. */
+ stack_id = sad_pmon_mapping[stack_id];
+
+ ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number;
+ type->topology[die].segment = pci_domain_nr(dev->bus);
+ }
+
+ if (ret) {
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+
+ return ret;
+}
+
+/*
+ * SNR has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ SNR_QAT_PMON_ID,
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID
+};
+
+static u8 snr_sad_pmon_mapping[] = {
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_QAT_PMON_ID
+};
+
+static int snr_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
+}
+
+static int snr_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+}
+
+static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
+}
+
+static struct event_constraint snr_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
static struct intel_uncore_type snr_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4417,8 +4547,13 @@ static struct intel_uncore_type snr_uncore_iio = {
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.box_ctl = SNR_IIO_MSR_PMON_BOX_CTL,
.msr_offset = SNR_IIO_MSR_OFFSET,
+ .constraints = snr_uncore_iio_constraints,
.ops = &ivbep_uncore_msr_ops,
.format_group = &snr_uncore_iio_format_group,
+ .attr_update = snr_iio_attr_update,
+ .get_topology = snr_iio_get_topology,
+ .set_mapping = snr_iio_set_mapping,
+ .cleanup_mapping = snr_iio_cleanup_mapping,
};
static struct intel_uncore_type snr_uncore_irp = {
@@ -4684,13 +4819,15 @@ int snr_uncore_pci_init(void)
return 0;
}
-static struct pci_dev *snr_uncore_get_mc_dev(int id)
+#define SNR_MC_DEVICE_ID 0x3451
+
+static struct pci_dev *snr_uncore_get_mc_dev(unsigned int device, int id)
{
struct pci_dev *mc_dev = NULL;
int pkg;
while (1) {
- mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
+ mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, mc_dev);
if (!mc_dev)
break;
pkg = uncore_pcibus_to_dieid(mc_dev->bus);
@@ -4700,19 +4837,20 @@ static struct pci_dev *snr_uncore_get_mc_dev(int id)
return mc_dev;
}
-static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
- unsigned int box_ctl, int mem_offset)
+static int snr_uncore_mmio_map(struct intel_uncore_box *box,
+ unsigned int box_ctl, int mem_offset,
+ unsigned int device)
{
- struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid);
+ struct pci_dev *pdev = snr_uncore_get_mc_dev(device, box->dieid);
struct intel_uncore_type *type = box->pmu->type;
resource_size_t addr;
u32 pci_dword;
if (!pdev)
- return;
+ return -ENODEV;
pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword);
- addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23;
+ addr = ((resource_size_t)pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23;
pci_read_config_dword(pdev, mem_offset, &pci_dword);
addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12;
@@ -4722,16 +4860,25 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
- return;
+ return -EINVAL;
}
- writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
+ return 0;
+}
+
+static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
+ unsigned int box_ctl, int mem_offset,
+ unsigned int device)
+{
+ if (!snr_uncore_mmio_map(box, box_ctl, mem_offset, device))
+ writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
}
static void snr_uncore_mmio_init_box(struct intel_uncore_box *box)
{
__snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box),
- SNR_IMC_MMIO_MEM0_OFFSET);
+ SNR_IMC_MMIO_MEM0_OFFSET,
+ SNR_MC_DEVICE_ID);
}
static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box)
@@ -4941,11 +5088,65 @@ static struct event_constraint icx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
+static umode_t
+icx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 5. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 5);
+}
+
+static struct attribute_group icx_iio_mapping_group = {
+ .is_visible = icx_iio_mapping_visible,
+};
+
+static const struct attribute_group *icx_iio_attr_update[] = {
+ &icx_iio_mapping_group,
+ NULL,
+};
+
+/*
+ * ICX has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+ ICX_CBDMA_DMI_PMON_ID
+};
+
+static u8 icx_sad_pmon_mapping[] = {
+ ICX_CBDMA_DMI_PMON_ID,
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+};
+
+static int icx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
+}
+
+static int icx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+}
+
+static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
+}
+
static struct intel_uncore_type icx_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4960,6 +5161,10 @@ static struct intel_uncore_type icx_uncore_iio = {
.constraints = icx_uncore_iio_constraints,
.ops = &skx_uncore_iio_ops,
.format_group = &snr_uncore_iio_format_group,
+ .attr_update = icx_iio_attr_update,
+ .get_topology = icx_iio_get_topology,
+ .set_mapping = icx_iio_set_mapping,
+ .cleanup_mapping = icx_iio_cleanup_mapping,
};
static struct intel_uncore_type icx_uncore_irp = {
@@ -5112,9 +5317,10 @@ static struct intel_uncore_type icx_uncore_m2m = {
.perf_ctr = SNR_M2M_PCI_PMON_CTR0,
.event_ctl = SNR_M2M_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT,
.box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
.ops = &snr_m2m_uncore_pci_ops,
- .format_group = &skx_uncore_format_group,
+ .format_group = &snr_m2m_uncore_format_group,
};
static struct attribute *icx_upi_uncore_formats_attr[] = {
@@ -5254,7 +5460,8 @@ static void icx_uncore_imc_init_box(struct intel_uncore_box *box)
int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE +
SNR_IMC_MMIO_MEM0_OFFSET;
- __snr_uncore_mmio_init_box(box, box_ctl, mem_offset);
+ __snr_uncore_mmio_init_box(box, box_ctl, mem_offset,
+ SNR_MC_DEVICE_ID);
}
static struct intel_uncore_ops icx_uncore_mmio_ops = {
@@ -5270,12 +5477,12 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = {
static struct intel_uncore_type icx_uncore_imc = {
.name = "imc",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 12,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
- .event_descs = hswep_uncore_imc_events,
+ .event_descs = snr_uncore_imc_events,
.perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
.event_ctl = SNR_IMC_MMIO_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
@@ -5324,7 +5531,8 @@ static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE +
SNR_IMC_MMIO_MEM0_OFFSET;
- __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), mem_offset);
+ snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box),
+ mem_offset, SNR_MC_DEVICE_ID);
}
static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = {
@@ -5358,3 +5566,507 @@ void icx_uncore_mmio_init(void)
}
/* end of ICX uncore support */
+
+/* SPR uncore support */
+
+static void spr_uncore_msr_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrl(reg1->reg, reg1->config);
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void spr_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrl(reg1->reg, 0);
+
+ wrmsrl(hwc->config_base, 0);
+}
+
+static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN);
+ struct intel_uncore_type *type = box->pmu->type;
+
+ if (tie_en) {
+ reg1->reg = SPR_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * type->box_ids[box->pmu->pmu_idx];
+ reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID;
+ reg1->idx = 0;
+ }
+
+ return 0;
+}
+
+static struct intel_uncore_ops spr_uncore_chabox_ops = {
+ .init_box = intel_generic_uncore_msr_init_box,
+ .disable_box = intel_generic_uncore_msr_disable_box,
+ .enable_box = intel_generic_uncore_msr_enable_box,
+ .disable_event = spr_uncore_msr_disable_event,
+ .enable_event = spr_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = spr_cha_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct attribute *spr_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext4.attr,
+ &format_attr_tid_en2.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid5.attr,
+ NULL,
+};
+static const struct attribute_group spr_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = spr_uncore_cha_formats_attr,
+};
+
+static ssize_t alias_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ char pmu_name[UNCORE_PMU_NAME_LEN];
+
+ uncore_get_alias_name(pmu_name, pmu);
+ return sysfs_emit(buf, "%s\n", pmu_name);
+}
+
+static DEVICE_ATTR_RO(alias);
+
+static struct attribute *uncore_alias_attrs[] = {
+ &dev_attr_alias.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(uncore_alias);
+
+static struct intel_uncore_type spr_uncore_chabox = {
+ .name = "cha",
+ .event_mask = SPR_CHA_PMON_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &spr_uncore_chabox_ops,
+ .format_group = &spr_uncore_chabox_format_group,
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type spr_uncore_iio = {
+ .name = "iio",
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .format_group = &snr_uncore_iio_format_group,
+ .attr_update = uncore_alias_groups,
+ .constraints = icx_uncore_iio_constraints,
+};
+
+static struct attribute *spr_uncore_raw_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext4.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group spr_uncore_raw_format_group = {
+ .name = "format",
+ .attrs = spr_uncore_raw_formats_attr,
+};
+
+#define SPR_UNCORE_COMMON_FORMAT() \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \
+ .format_group = &spr_uncore_raw_format_group, \
+ .attr_update = uncore_alias_groups
+
+static struct intel_uncore_type spr_uncore_irp = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "irp",
+
+};
+
+static struct event_constraint spr_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type spr_uncore_m2pcie = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "m2pcie",
+ .constraints = spr_uncore_m2pcie_constraints,
+};
+
+static struct intel_uncore_type spr_uncore_pcu = {
+ .name = "pcu",
+ .attr_update = uncore_alias_groups,
+};
+
+static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ if (uncore_pmc_fixed(hwc->idx))
+ writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base);
+ else
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops spr_uncore_mmio_ops = {
+ .init_box = intel_generic_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = spr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type spr_uncore_imc = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "imc",
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
+ .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
+ .ops = &spr_uncore_mmio_ops,
+};
+
+static void spr_uncore_pci_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32));
+ pci_write_config_dword(pdev, hwc->config_base, (u32)hwc->config);
+}
+
+static struct intel_uncore_ops spr_uncore_pci_ops = {
+ .init_box = intel_generic_uncore_pci_init_box,
+ .disable_box = intel_generic_uncore_pci_disable_box,
+ .enable_box = intel_generic_uncore_pci_enable_box,
+ .disable_event = intel_generic_uncore_pci_disable_event,
+ .enable_event = spr_uncore_pci_enable_event,
+ .read_counter = intel_generic_uncore_pci_read_counter,
+};
+
+#define SPR_UNCORE_PCI_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_pci_ops
+
+static struct intel_uncore_type spr_uncore_m2m = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m2m",
+};
+
+static struct intel_uncore_type spr_uncore_upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "upi",
+};
+
+static struct intel_uncore_type spr_uncore_m3upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m3upi",
+ .constraints = icx_uncore_m3upi_constraints,
+};
+
+static struct intel_uncore_type spr_uncore_mdf = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "mdf",
+};
+
+#define UNCORE_SPR_NUM_UNCORE_TYPES 12
+#define UNCORE_SPR_IIO 1
+#define UNCORE_SPR_IMC 6
+
+static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
+ &spr_uncore_chabox,
+ &spr_uncore_iio,
+ &spr_uncore_irp,
+ &spr_uncore_m2pcie,
+ &spr_uncore_pcu,
+ NULL,
+ &spr_uncore_imc,
+ &spr_uncore_m2m,
+ &spr_uncore_upi,
+ &spr_uncore_m3upi,
+ NULL,
+ &spr_uncore_mdf,
+};
+
+enum perf_uncore_spr_iio_freerunning_type_id {
+ SPR_IIO_MSR_IOCLK,
+ SPR_IIO_MSR_BW_IN,
+ SPR_IIO_MSR_BW_OUT,
+
+ SPR_IIO_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters spr_iio_freerunning[] = {
+ [SPR_IIO_MSR_IOCLK] = { 0x340e, 0x1, 0x10, 1, 48 },
+ [SPR_IIO_MSR_BW_IN] = { 0x3800, 0x1, 0x10, 8, 48 },
+ [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 },
+};
+
+static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = {
+ /* Free-Running IIO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH IN Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
+ /* Free-Running IIO BANDWIDTH OUT Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x33"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type spr_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 17,
+ .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = spr_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = spr_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+enum perf_uncore_spr_imc_freerunning_type_id {
+ SPR_IMC_DCLK,
+ SPR_IMC_PQ_CYCLES,
+
+ SPR_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters spr_imc_freerunning[] = {
+ [SPR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
+ [SPR_IMC_PQ_CYCLES] = { 0x2318, 0x8, 0, 2, 48 },
+};
+
+static struct uncore_event_desc spr_uncore_imc_freerunning_events[] = {
+ INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
+
+ INTEL_UNCORE_EVENT_DESC(rpq_cycles, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(wpq_cycles, "event=0xff,umask=0x21"),
+ { /* end: all zeroes */ },
+};
+
+#define SPR_MC_DEVICE_ID 0x3251
+
+static void spr_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET;
+
+ snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box),
+ mem_offset, SPR_MC_DEVICE_ID);
+}
+
+static struct intel_uncore_ops spr_uncore_imc_freerunning_ops = {
+ .init_box = spr_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type spr_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .num_freerunning_types = SPR_IMC_FREERUNNING_TYPE_MAX,
+ .freerunning = spr_imc_freerunning,
+ .ops = &spr_uncore_imc_freerunning_ops,
+ .event_descs = spr_uncore_imc_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+#define UNCORE_SPR_MSR_EXTRA_UNCORES 1
+#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1
+
+static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = {
+ &spr_uncore_iio_free_running,
+};
+
+static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES] = {
+ &spr_uncore_imc_free_running,
+};
+
+static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
+ struct intel_uncore_type *from_type)
+{
+ if (!to_type || !from_type)
+ return;
+
+ if (from_type->name)
+ to_type->name = from_type->name;
+ if (from_type->fixed_ctr_bits)
+ to_type->fixed_ctr_bits = from_type->fixed_ctr_bits;
+ if (from_type->event_mask)
+ to_type->event_mask = from_type->event_mask;
+ if (from_type->event_mask_ext)
+ to_type->event_mask_ext = from_type->event_mask_ext;
+ if (from_type->fixed_ctr)
+ to_type->fixed_ctr = from_type->fixed_ctr;
+ if (from_type->fixed_ctl)
+ to_type->fixed_ctl = from_type->fixed_ctl;
+ if (from_type->fixed_ctr_bits)
+ to_type->fixed_ctr_bits = from_type->fixed_ctr_bits;
+ if (from_type->num_shared_regs)
+ to_type->num_shared_regs = from_type->num_shared_regs;
+ if (from_type->constraints)
+ to_type->constraints = from_type->constraints;
+ if (from_type->ops)
+ to_type->ops = from_type->ops;
+ if (from_type->event_descs)
+ to_type->event_descs = from_type->event_descs;
+ if (from_type->format_group)
+ to_type->format_group = from_type->format_group;
+ if (from_type->attr_update)
+ to_type->attr_update = from_type->attr_update;
+}
+
+static struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra)
+{
+ struct intel_uncore_type **types, **start_types;
+ int i;
+
+ start_types = types = intel_uncore_generic_init_uncores(type_id, num_extra);
+
+ /* Only copy the customized features */
+ for (; *types; types++) {
+ if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES)
+ continue;
+ uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]);
+ }
+
+ for (i = 0; i < num_extra; i++, types++)
+ *types = extra[i];
+
+ return start_types;
+}
+
+static struct intel_uncore_type *
+uncore_find_type_by_id(struct intel_uncore_type **types, int type_id)
+{
+ for (; *types; types++) {
+ if (type_id == (*types)->type_id)
+ return *types;
+ }
+
+ return NULL;
+}
+
+static int uncore_type_max_boxes(struct intel_uncore_type **types,
+ int type_id)
+{
+ struct intel_uncore_type *type;
+ int i, max = 0;
+
+ type = uncore_find_type_by_id(types, type_id);
+ if (!type)
+ return 0;
+
+ for (i = 0; i < type->num_boxes; i++) {
+ if (type->box_ids[i] > max)
+ max = type->box_ids[i];
+ }
+
+ return max + 1;
+}
+
+void spr_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
+ UNCORE_SPR_MSR_EXTRA_UNCORES,
+ spr_msr_uncores);
+
+ spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
+}
+
+int spr_uncore_pci_init(void)
+{
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL);
+ return 0;
+}
+
+void spr_uncore_mmio_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true);
+
+ if (ret)
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL);
+ else {
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_SPR_MMIO_EXTRA_UNCORES,
+ spr_mmio_uncores);
+
+ spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2;
+ }
+}
+
+/* end of SPR uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 680404c58cb1..ac542f98c070 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_G:
case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_D:
@@ -100,6 +101,11 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
+ case INTEL_FAM6_ALDERLAKE:
+ case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_RAPTORLAKE:
+ case INTEL_FAM6_RAPTORLAKE_P:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53b2b5fc23bc..21a5482bcf84 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,7 +14,9 @@
#include <linux/perf_event.h>
+#include <asm/fpu/xstate.h>
#include <asm/intel_ds.h>
+#include <asm/cpu.h>
/* To enable MSR tracing please use the generic trace points. */
@@ -65,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
-#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */
-#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
-#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
-#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
-#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */
+
+#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */
+#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */
+#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */
+#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */
+#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
+#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -213,7 +216,8 @@ enum {
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_INFO = 0x05,
LBR_FORMAT_TIME = 0x06,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
+ LBR_FORMAT_INFO2 = 0x07,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2,
};
enum {
@@ -228,7 +232,7 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
@@ -322,11 +326,15 @@ struct cpu_hw_events {
* AMD specific bits
*/
struct amd_nb *amd_nb;
+ int brs_active; /* BRS is enabled */
+
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
int n_pair; /* Large increment events */
void *kfree_on_online[X86_PERF_KFREE_MAX];
+
+ struct pmu *pmu;
};
#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \
@@ -630,6 +638,85 @@ enum {
x86_lbr_exclusive_max,
};
+struct x86_hybrid_pmu {
+ struct pmu pmu;
+ const char *name;
+ u8 cpu_type;
+ cpumask_t supported_cpus;
+ union perf_capabilities intel_cap;
+ u64 intel_ctrl;
+ int max_pebs_events;
+ int num_counters;
+ int num_counters_fixed;
+ struct event_constraint unconstrained;
+
+ u64 hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ u64 hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ struct event_constraint *event_constraints;
+ struct event_constraint *pebs_constraints;
+ struct extra_reg *extra_regs;
+
+ unsigned int late_ack :1,
+ mid_ack :1,
+ enabled_ack :1;
+};
+
+static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
+{
+ return container_of(pmu, struct x86_hybrid_pmu, pmu);
+}
+
+extern struct static_key_false perf_is_hybrid;
+#define is_hybrid() static_branch_unlikely(&perf_is_hybrid)
+
+#define hybrid(_pmu, _field) \
+(*({ \
+ typeof(&x86_pmu._field) __Fp = &x86_pmu._field; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_field; \
+ \
+ __Fp; \
+}))
+
+#define hybrid_var(_pmu, _var) \
+(*({ \
+ typeof(&_var) __Fp = &_var; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_var; \
+ \
+ __Fp; \
+}))
+
+#define hybrid_bit(_pmu, _field) \
+({ \
+ bool __Fp = x86_pmu._field; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = hybrid_pmu(_pmu)->_field; \
+ \
+ __Fp; \
+})
+
+enum hybrid_pmu_type {
+ hybrid_big = 0x40,
+ hybrid_small = 0x20,
+
+ hybrid_big_small = hybrid_big | hybrid_small,
+};
+
+#define X86_HYBRID_PMU_ATOM_IDX 0
+#define X86_HYBRID_PMU_CORE_IDX 1
+
+#define X86_HYBRID_NUM_PMUS 2
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -644,6 +731,7 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*assign)(struct perf_event *event, int idx);
void (*add)(struct perf_event *);
void (*del)(struct perf_event *);
void (*read)(struct perf_event *event);
@@ -687,6 +775,7 @@ struct x86_pmu {
/* PMI handler bits */
unsigned int late_ack :1,
+ mid_ack :1,
enabled_ack :1;
/*
* sysfs attrs
@@ -755,6 +844,11 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
+ unsigned int lbr_has_info:1;
+ unsigned int lbr_has_tsx:1;
+ unsigned int lbr_from_flags:1;
+ unsigned int lbr_to_cycles:1;
+
/*
* Intel Architectural LBR CPUID Enumeration
*/
@@ -816,6 +910,19 @@ struct x86_pmu {
int (*check_period) (struct perf_event *event, u64 period);
int (*aux_output_match) (struct perf_event *event);
+
+ int (*filter_match)(struct perf_event *event);
+ /*
+ * Hybrid support
+ *
+ * Most PMU capabilities are the same among different hybrid PMUs.
+ * The global x86_pmu saves the architecture capabilities, which
+ * are available for all PMUs. The hybrid_pmu only includes the
+ * unique capabilities.
+ */
+ int num_hybrid_pmus;
+ struct x86_hybrid_pmu *hybrid_pmu;
+ u8 (*get_hybrid_cpu_type) (void);
};
struct x86_perf_task_context_opt {
@@ -905,7 +1012,23 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \
.event_str_ht = ht, \
}
-struct pmu *x86_get_pmu(void);
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str = str, \
+ .pmu_type = _pmu, \
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu) \
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+ .attr = __ATTR_RO(_name), \
+ .pmu_type = _pmu, \
+}
+
+struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
@@ -964,6 +1087,9 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+bool check_hw_exists(struct pmu *pmu, int num_counters,
+ int num_counters_fixed);
+
int x86_add_exclusive(unsigned int what);
void x86_del_exclusive(unsigned int what);
@@ -982,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
+static inline bool has_amd_brs(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_AMD_BRS;
+}
+
static inline bool is_counter_pair(struct hw_perf_event *hwc)
{
return hwc->flags & PERF_X86_EVENT_PAIR;
@@ -1015,9 +1146,10 @@ void x86_pmu_stop(struct perf_event *event, int flags);
static inline void x86_pmu_disable_event(struct perf_event *event)
{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
struct hw_perf_event *hwc = &event->hw;
- wrmsrl(hwc->config_base, hwc->config);
+ wrmsrl(hwc->config_base, hwc->config & ~disable_mask);
if (is_counter_pair(hwc))
wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);
@@ -1027,6 +1159,11 @@ void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl);
+
+void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
+
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@@ -1067,16 +1204,90 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page);
-static inline bool fixed_counter_disabled(int i)
+static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
{
- return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+ u64 intel_ctrl = hybrid(pmu, intel_ctrl);
+
+ return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
}
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+int amd_brs_init(void);
+void amd_brs_disable(void);
+void amd_brs_enable(void);
+void amd_brs_enable_all(void);
+void amd_brs_disable_all(void);
+void amd_brs_drain(void);
+void amd_brs_lopwr_init(void);
+void amd_brs_disable_all(void);
+int amd_brs_setup_filter(struct perf_event *event);
+void amd_brs_reset(void);
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ perf_sched_cb_inc(event->ctx->pmu);
+ cpuc->lbr_users++;
+ /*
+ * No need to reset BRS because it is reset
+ * on brs_enable() and it is saturating
+ */
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+
+ perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
+#else
+static inline int amd_brs_init(void)
+{
+ return 0;
+}
+static inline void amd_brs_disable(void) {}
+static inline void amd_brs_enable(void) {}
+static inline void amd_brs_drain(void) {}
+static inline void amd_brs_lopwr_init(void) {}
+static inline void amd_brs_disable_all(void) {}
+static inline int amd_brs_setup_filter(struct perf_event *event)
+{
+ return 0;
+}
+static inline void amd_brs_reset(void) {}
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+#endif
+
#else /* CONFIG_CPU_SUP_AMD */
static inline int amd_pmu_init(void)
@@ -1084,6 +1295,22 @@ static inline int amd_pmu_init(void)
return 0;
}
+static inline int amd_brs_init(void)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void amd_brs_drain(void)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+static inline void amd_brs_disable_all(void)
+{
+}
#endif /* CONFIG_CPU_SUP_AMD */
static inline int is_pebs_pt(struct perf_event *event)
@@ -1114,6 +1341,25 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
return intel_pmu_has_bts_period(event, hwc->sample_period);
}
+static __always_inline void __intel_pmu_pebs_disable_all(void)
+{
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static __always_inline void __intel_pmu_arch_lbr_disable(void)
+{
+ wrmsrl(MSR_ARCH_LBR_CTL, 0);
+}
+
+static __always_inline void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
@@ -1135,6 +1381,8 @@ void reserve_ds_buffers(void);
void release_lbr_buffers(void);
+void reserve_lbr_buffers(void);
+
extern struct event_constraint bts_constraint;
extern struct event_constraint vlbr_constraint;
@@ -1154,6 +1402,8 @@ extern struct event_constraint intel_glm_pebs_event_constraints[];
extern struct event_constraint intel_glp_pebs_event_constraints[];
+extern struct event_constraint intel_grt_pebs_event_constraints[];
+
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
@@ -1241,6 +1491,8 @@ void intel_pmu_lbr_init_skl(void);
void intel_pmu_lbr_init_knl(void);
+void intel_pmu_lbr_init(void);
+
void intel_pmu_arch_lbr_init(void);
void intel_pmu_pebs_data_source_nhm(void);
@@ -1282,6 +1534,10 @@ static inline void release_lbr_buffers(void)
{
}
+static inline void reserve_lbr_buffers(void)
+{
+}
+
static inline int intel_pmu_init(void)
{
return 0;
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index f42a70496a24..77e3a47af5ad 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -536,11 +536,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
* - perf_msr_probe(PERF_RAPL_MAX)
* - want to use same event codes across both architectures
*/
-static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = {
- [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
+static struct perf_msr amd_rapl_msrs[] = {
+ [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 },
+ [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 },
+ [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 },
+ [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 },
};
-
static int rapl_cpu_offline(unsigned int cpu)
{
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
@@ -764,13 +767,14 @@ static struct rapl_model model_spr = {
.rapl_msrs = intel_rapl_spr_msrs,
};
-static struct rapl_model model_amd_fam17h = {
+static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
+ X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb),
X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep),
X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb),
@@ -800,10 +804,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
- X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h),
- X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h),
- X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index e68827e604ad..949d845c922b 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -494,7 +494,7 @@ static __init void zhaoxin_arch_events_quirk(void)
{
int bit;
- /* disable event that reported as not presend by cpuid */
+ /* disable event that reported as not present by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
pr_warn("CPUID marked event: \'%s\' unavailable\n",
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 48e2c51464e8..5d2de10809ae 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := hv_init.o mmu.o nested.o irqdomain.o
+obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 284e73661a18..db2d92fb44da 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -60,9 +60,11 @@ static u32 hv_apic_read(u32 reg)
switch (reg) {
case APIC_EOI:
rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+ (void)hi;
return reg_val;
case APIC_TASKPRI:
rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+ (void)hi;
return reg_val;
default:
@@ -97,13 +99,14 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
/*
* IPI implementation on Hyper-V.
*/
-static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
+static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
+ bool exclude_self)
{
struct hv_send_ipi_ex **arg;
struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
- int ret = 1;
+ u64 status = HV_STATUS_INVALID_PARAMETER;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
return false;
@@ -119,32 +122,55 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
ipi_arg->reserved = 0;
ipi_arg->vp_set.valid_bank_mask = 0;
- if (!cpumask_equal(mask, cpu_present_mask)) {
+ /*
+ * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET
+ * when the IPI is sent to all currently present CPUs.
+ */
+ if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
- }
- if (nr_bank < 0)
- goto ipi_mask_ex_done;
- if (!nr_bank)
+ if (exclude_self)
+ nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
+ else
+ nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+
+ /*
+ * 'nr_bank <= 0' means some CPUs in cpumask can't be
+ * represented in VP_SET. Return an error and fall back to
+ * native (architectural) method of sending IPIs.
+ */
+ if (nr_bank <= 0)
+ goto ipi_mask_ex_done;
+ } else {
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+ }
- ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);
ipi_mask_ex_done:
local_irq_restore(flags);
- return ((ret == 0) ? true : false);
+ return hv_result_success(status);
}
-static bool __send_ipi_mask(const struct cpumask *mask, int vector)
+static bool __send_ipi_mask(const struct cpumask *mask, int vector,
+ bool exclude_self)
{
- int cur_cpu, vcpu;
+ int cur_cpu, vcpu, this_cpu = smp_processor_id();
struct hv_send_ipi ipi_arg;
- int ret = 1;
+ u64 status;
+ unsigned int weight;
trace_hyperv_send_ipi_mask(mask, vector);
- if (cpumask_empty(mask))
+ weight = cpumask_weight(mask);
+
+ /*
+ * Do nothing if
+ * 1. the mask is empty
+ * 2. the mask only contains self when exclude_self is true
+ */
+ if (weight == 0 ||
+ (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
return true;
if (!hv_hypercall_pg)
@@ -170,6 +196,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
ipi_arg.cpu_mask = 0;
for_each_cpu(cur_cpu, mask) {
+ if (exclude_self && cur_cpu == this_cpu)
+ continue;
vcpu = hv_cpu_number_to_vp_number(cur_cpu);
if (vcpu == VP_INVAL)
return false;
@@ -184,17 +212,18 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
__set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
}
- ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
ipi_arg.cpu_mask);
- return ((ret == 0) ? true : false);
+ return hv_result_success(status);
do_ex_hypercall:
- return __send_ipi_mask_ex(mask, vector);
+ return __send_ipi_mask_ex(mask, vector, exclude_self);
}
static bool __send_ipi_one(int cpu, int vector)
{
int vp = hv_cpu_number_to_vp_number(cpu);
+ u64 status;
trace_hyperv_send_ipi_one(cpu, vector);
@@ -205,9 +234,10 @@ static bool __send_ipi_one(int cpu, int vector)
return false;
if (vp >= 64)
- return __send_ipi_mask_ex(cpumask_of(cpu), vector);
+ return __send_ipi_mask_ex(cpumask_of(cpu), vector, false);
- return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ return hv_result_success(status);
}
static void hv_send_ipi(int cpu, int vector)
@@ -218,20 +248,13 @@ static void hv_send_ipi(int cpu, int vector)
static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
{
- if (!__send_ipi_mask(mask, vector))
+ if (!__send_ipi_mask(mask, vector, false))
orig_apic.send_IPI_mask(mask, vector);
}
static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned int this_cpu = smp_processor_id();
- struct cpumask new_mask;
- const struct cpumask *local_mask;
-
- cpumask_copy(&new_mask, mask);
- cpumask_clear_cpu(this_cpu, &new_mask);
- local_mask = &new_mask;
- if (!__send_ipi_mask(local_mask, vector))
+ if (!__send_ipi_mask(mask, vector, true))
orig_apic.send_IPI_mask_allbutself(mask, vector);
}
@@ -242,7 +265,7 @@ static void hv_send_ipi_allbutself(int vector)
static void hv_send_ipi_all(int vector)
{
- if (!__send_ipi_mask(cpu_online_mask, vector))
+ if (!__send_ipi_mask(cpu_online_mask, vector, false))
orig_apic.send_IPI_all(vector);
}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index b81047dec1da..3de6d8b53367 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -7,12 +7,13 @@
* Author : K. Y. Srinivasan <kys@microsoft.com>
*/
-#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
+#include <linux/io.h>
#include <asm/apic.h>
#include <asm/desc.h>
+#include <asm/sev.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -28,6 +29,7 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
+#include <linux/swiotlb.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -36,99 +38,86 @@ EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+union hv_ghcb * __percpu *hv_ghcb_pg;
+
/* Storage to save the hypercall page temporarily for hibernation */
static void *hv_hypercall_pg_saved;
-u32 *hv_vp_index;
-EXPORT_SYMBOL_GPL(hv_vp_index);
-
struct hv_vp_assist_page **hv_vp_assist_page;
EXPORT_SYMBOL_GPL(hv_vp_assist_page);
-void __percpu **hyperv_pcpu_input_arg;
-EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
-
-void __percpu **hyperv_pcpu_output_arg;
-EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
-
-u32 hv_max_vp_index;
-EXPORT_SYMBOL_GPL(hv_max_vp_index);
-
-void *hv_alloc_hyperv_page(void)
+static int hyperv_init_ghcb(void)
{
- BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
+ u64 ghcb_gpa;
+ void *ghcb_va;
+ void **ghcb_base;
- return (void *)__get_free_page(GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
+ if (!hv_isolation_type_snp())
+ return 0;
-void *hv_alloc_hyperv_zeroed_page(void)
-{
- BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
+ if (!hv_ghcb_pg)
+ return -EINVAL;
- return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
+ /*
+ * GHCB page is allocated by paravisor. The address
+ * returned by MSR_AMD64_SEV_ES_GHCB is above shared
+ * memory boundary and map it here.
+ */
+ rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
+ ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB);
+ if (!ghcb_va)
+ return -ENOMEM;
-void hv_free_hyperv_page(unsigned long addr)
-{
- free_page(addr);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ *ghcb_base = ghcb_va;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
static int hv_cpu_init(unsigned int cpu)
{
- u64 msr_vp_index;
+ union hv_vp_assist_msr_contents msr = { 0 };
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
- void **input_arg;
- struct page *pg;
-
- /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
- pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
- if (unlikely(!pg))
- return -ENOMEM;
-
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- *input_arg = page_address(pg);
- if (hv_root_partition) {
- void **output_arg;
-
- output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
- *output_arg = page_address(pg + 1);
- }
-
- hv_get_vp_index(msr_vp_index);
-
- hv_vp_index[smp_processor_id()] = msr_vp_index;
+ int ret;
- if (msr_vp_index > hv_max_vp_index)
- hv_max_vp_index = msr_vp_index;
+ ret = hv_common_cpu_init(cpu);
+ if (ret)
+ return ret;
if (!hv_vp_assist_page)
return 0;
- /*
- * The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section
- * 5.2.1 "GPA Overlay Pages"). Here it must be zeroed out to make sure
- * we always write the EOI MSR in hv_apic_eoi_write() *after* the
- * EOI optimization is disabled in hv_cpu_die(), otherwise a CPU may
- * not be stopped in the case of CPU offlining and the VM will hang.
- */
if (!*hvp) {
- *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (hv_root_partition) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn <<
+ HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+ }
+ WARN_ON(!(*hvp));
+ if (*hvp) {
+ msr.enable = 1;
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ }
}
- if (*hvp) {
- u64 val;
-
- val = vmalloc_to_pfn(*hvp);
- val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
- HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
- }
-
- return 0;
+ return hyperv_init_ghcb();
}
static void (*hv_reenlightenment_cb)(void);
@@ -162,7 +151,7 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
static inline bool hv_reenlightenment_available(void)
{
/*
- * Check for required features and priviliges to make TSC frequency
+ * Check for required features and privileges to make TSC frequency
* change notifications work.
*/
return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
@@ -182,7 +171,6 @@ void set_hv_tscchange_cb(void (*cb)(void))
struct hv_reenlightenment_control re_ctrl = {
.vector = HYPERV_REENLIGHTENMENT_VECTOR,
.enabled = 1,
- .target_vp = hv_vp_index[smp_processor_id()]
};
struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
@@ -191,13 +179,20 @@ void set_hv_tscchange_cb(void (*cb)(void))
return;
}
+ if (!hv_vp_index)
+ return;
+
hv_reenlightenment_cb = cb;
/* Make sure callback is registered before we write to MSRs */
wmb();
+ re_ctrl.target_vp = hv_vp_index[get_cpu()];
+
wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+
+ put_cpu();
}
EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
@@ -220,28 +215,33 @@ static int hv_cpu_die(unsigned int cpu)
{
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
- unsigned long flags;
- void **input_arg;
- void *pg;
-
- local_irq_save(flags);
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- pg = *input_arg;
- *input_arg = NULL;
+ void **ghcb_va;
- if (hv_root_partition) {
- void **output_arg;
-
- output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
- *output_arg = NULL;
+ if (hv_ghcb_pg) {
+ ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
+ if (*ghcb_va)
+ memunmap(*ghcb_va);
+ *ghcb_va = NULL;
}
- local_irq_restore(flags);
-
- free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
-
- if (hv_vp_assist_page && hv_vp_assist_page[cpu])
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
+ hv_common_cpu_die(cpu);
+
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
+ union hv_vp_assist_msr_contents msr = { 0 };
+ if (hv_root_partition) {
+ /*
+ * For root partition the VP assist page is mapped to
+ * hypervisor provided page, and thus we unmap the
+ * page here and nullify it, so that in future we have
+ * correct page address mapped in hv_cpu_init.
+ */
+ memunmap(hv_vp_assist_page[cpu]);
+ hv_vp_assist_page[cpu] = NULL;
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ msr.enable = 0;
+ }
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ }
if (hv_reenlightenment_cb == NULL)
return 0;
@@ -292,7 +292,7 @@ static int hv_suspend(void)
/*
* Reset the hypercall page as it is going to be invalidated
- * accross hibernation. Setting hv_hypercall_pg to NULL ensures
+ * across hibernation. Setting hv_hypercall_pg to NULL ensures
* that any subsequent hypercall operation fails safely instead of
* crashing due to an access of an invalid page. The hypercall page
* pointer is restored on resume.
@@ -349,7 +349,7 @@ static void __init hv_stimer_setup_percpu_clockev(void)
* Ignore any errors in setting up stimer clockevents
* as we can run with the LAPIC timer as a fallback.
*/
- (void)hv_stimer_alloc();
+ (void)hv_stimer_alloc(false);
/*
* Still register the LAPIC timer, because the direct-mode STIMER is
@@ -369,7 +369,7 @@ static void __init hv_get_partition_id(void)
local_irq_save(flags);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ if (!hv_result_success(status)) {
/* No point in proceeding if this failed */
pr_err("Failed to get partition ID: %lld\n", status);
BUG();
@@ -388,56 +388,38 @@ static void __init hv_get_partition_id(void)
*/
void __init hyperv_init(void)
{
- u64 guest_id, required_msrs;
+ u64 guest_id;
union hv_x64_msr_hypercall_contents hypercall_msr;
- int cpuhp, i;
+ int cpuhp;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
- /* Absolutely required MSRs */
- required_msrs = HV_MSR_HYPERCALL_AVAILABLE |
- HV_MSR_VP_INDEX_AVAILABLE;
-
- if ((ms_hyperv.features & required_msrs) != required_msrs)
- return;
-
- /*
- * Allocate the per-CPU state for the hypercall input arg.
- * If this allocation fails, we will not be able to setup
- * (per-CPU) hypercall input page and thus this failure is
- * fatal on Hyper-V.
- */
- hyperv_pcpu_input_arg = alloc_percpu(void *);
-
- BUG_ON(hyperv_pcpu_input_arg == NULL);
-
- /* Allocate the per-CPU state for output arg for root */
- if (hv_root_partition) {
- hyperv_pcpu_output_arg = alloc_percpu(void *);
- BUG_ON(hyperv_pcpu_output_arg == NULL);
- }
-
- /* Allocate percpu VP index */
- hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
- GFP_KERNEL);
- if (!hv_vp_index)
+ if (hv_common_init())
return;
- for (i = 0; i < num_possible_cpus(); i++)
- hv_vp_index[i] = VP_INVAL;
-
hv_vp_assist_page = kcalloc(num_possible_cpus(),
sizeof(*hv_vp_assist_page), GFP_KERNEL);
if (!hv_vp_assist_page) {
ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
- goto free_vp_index;
+ goto common_free;
+ }
+
+ if (hv_isolation_type_snp()) {
+ /* Negotiate GHCB Version. */
+ if (!hv_ghcb_negotiate_protocol())
+ hv_ghcb_terminate(SEV_TERM_SET_GEN,
+ GHCB_SEV_ES_PROT_UNSUPPORTED);
+
+ hv_ghcb_pg = alloc_percpu(union hv_ghcb *);
+ if (!hv_ghcb_pg)
+ goto free_vp_assist_page;
}
cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
hv_cpu_init, hv_cpu_die);
if (cpuhp < 0)
- goto free_vp_assist_page;
+ goto free_ghcb_page;
/*
* Setup the hypercall page and enable hypercalls.
@@ -447,14 +429,15 @@ void __init hyperv_init(void)
guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+ /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
__builtin_return_address(0));
- if (hv_hypercall_pg == NULL) {
- wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- goto remove_cpuhp_state;
- }
+ if (hv_hypercall_pg == NULL)
+ goto clean_guest_os_id;
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
@@ -520,16 +503,32 @@ void __init hyperv_init(void)
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif
+ /* Query the VMs extended capability once, so that it can be cached. */
+ hv_query_ext_cap(0);
+
+#ifdef CONFIG_SWIOTLB
+ /*
+ * Swiotlb bounce buffer needs to be mapped in extra address
+ * space. Map function doesn't work in the early place and so
+ * call swiotlb_update_mem_attributes() here.
+ */
+ if (hv_is_isolation_supported())
+ swiotlb_update_mem_attributes();
+#endif
+
return;
-remove_cpuhp_state:
+clean_guest_os_id:
+ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
cpuhp_remove_state(cpuhp);
+free_ghcb_page:
+ free_percpu(hv_ghcb_pg);
free_vp_assist_page:
kfree(hv_vp_assist_page);
hv_vp_assist_page = NULL;
-free_vp_index:
- kfree(hv_vp_index);
- hv_vp_index = NULL;
+common_free:
+ hv_common_free();
}
/*
@@ -543,6 +542,7 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
/*
* Reset hypercall page reference before reset the page,
@@ -559,7 +559,6 @@ void hyperv_cleanup(void)
hypercall_msr.as_uint64 = 0;
wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
}
-EXPORT_SYMBOL_GPL(hyperv_cleanup);
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
{
@@ -593,33 +592,6 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
}
EXPORT_SYMBOL_GPL(hyperv_report_panic);
-/**
- * hyperv_report_panic_msg - report panic message to Hyper-V
- * @pa: physical address of the panic page containing the message
- * @size: size of the message in the page
- */
-void hyperv_report_panic_msg(phys_addr_t pa, size_t size)
-{
- /*
- * P3 to contain the physical address of the panic page & P4 to
- * contain the size of the panic data in that page. Rest of the
- * registers are no-op when the NOTIFY_MSG flag is set.
- */
- wrmsrl(HV_X64_MSR_CRASH_P0, 0);
- wrmsrl(HV_X64_MSR_CRASH_P1, 0);
- wrmsrl(HV_X64_MSR_CRASH_P2, 0);
- wrmsrl(HV_X64_MSR_CRASH_P3, pa);
- wrmsrl(HV_X64_MSR_CRASH_P4, size);
-
- /*
- * Let Hyper-V know there is crash data available along with
- * the panic message.
- */
- wrmsrl(HV_X64_MSR_CRASH_CTL,
- (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
-}
-EXPORT_SYMBOL_GPL(hyperv_report_panic_msg);
-
bool hv_is_hyperv_initialized(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
@@ -641,23 +613,3 @@ bool hv_is_hyperv_initialized(void)
return hypercall_msr.enable;
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
-
-bool hv_is_hibernation_supported(void)
-{
- return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
-}
-EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
-
-enum hv_isolation_type hv_get_isolation_type(void)
-{
- if (!(ms_hyperv.features_b & HV_ISOLATION))
- return HV_ISOLATION_TYPE_NONE;
- return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
-}
-EXPORT_SYMBOL_GPL(hv_get_isolation_type);
-
-bool hv_is_isolation_supported(void)
-{
- return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
-}
-EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index 60461e598239..68a0843d4750 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
@@ -93,10 +92,9 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
page_count, 0, input_page, NULL);
local_irq_restore(flags);
-
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ if (!hv_result_success(status)) {
pr_err("Failed to deposit pages: %lld\n", status);
- ret = status;
+ ret = hv_result(status);
goto err_free_allocations;
}
@@ -122,7 +120,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
struct hv_add_logical_processor_out *output;
u64 status;
unsigned long flags;
- int ret = 0;
+ int ret = HV_STATUS_SUCCESS;
int pxm = node_to_pxm(node);
/*
@@ -148,13 +146,11 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
input, output);
local_irq_restore(flags);
- status &= HV_HYPERCALL_RESULT_MASK;
-
- if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (status != HV_STATUS_SUCCESS) {
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
lp_index, apic_id, status);
- ret = status;
+ ret = hv_result(status);
}
break;
}
@@ -169,7 +165,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
struct hv_create_vp *input;
u64 status;
unsigned long irq_flags;
- int ret = 0;
+ int ret = HV_STATUS_SUCCESS;
int pxm = node_to_pxm(node);
/* Root VPs don't seem to need pages deposited */
@@ -200,13 +196,11 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
local_irq_restore(irq_flags);
- status &= HV_HYPERCALL_RESULT_MASK;
-
- if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (status != HV_STATUS_SUCCESS) {
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
vp_index, flags, status);
- ret = status;
+ ret = hv_result(status);
}
break;
}
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index f3270c1fc48c..91cfe698bde0 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -25,7 +25,6 @@ static void hv_qlock_kick(int cpu)
static void hv_qlock_wait(u8 *byte, u8 val)
{
- unsigned long msr_val;
unsigned long flags;
if (in_nmi())
@@ -48,8 +47,13 @@ static void hv_qlock_wait(u8 *byte, u8 val)
/*
* Only issue the rdmsrl() when the lock state has not changed.
*/
- if (READ_ONCE(*byte) == val)
+ if (READ_ONCE(*byte) == val) {
+ unsigned long msr_val;
+
rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val);
+
+ (void)msr_val;
+ }
local_irq_restore(flags);
}
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 4421a8d92e23..7e0f6bedc248 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -63,10 +63,10 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
local_irq_restore(flags);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+ if (!hv_result_success(status))
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
- return status & HV_HYPERCALL_RESULT_MASK;
+ return hv_result(status);
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
@@ -88,7 +88,7 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
- return status & HV_HYPERCALL_RESULT_MASK;
+ return hv_result(status);
}
#ifdef CONFIG_PCI_MSI
@@ -253,64 +253,43 @@ static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry
return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
}
-static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
{
- u64 status;
struct hv_interrupt_entry old_entry;
- struct irq_desc *desc;
- struct irq_data *data;
struct msi_msg msg;
+ u64 status;
- desc = irq_to_desc(irq);
- if (!desc) {
- pr_debug("%s: no irq desc\n", __func__);
- return;
- }
-
- data = &desc->irq_data;
- if (!data) {
- pr_debug("%s: no irq data\n", __func__);
- return;
- }
-
- if (!data->chip_data) {
+ if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
return;
}
- old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+ old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
entry_to_msi_msg(&old_entry, &msg);
- kfree(data->chip_data);
- data->chip_data = NULL;
+ kfree(irqd->chip_data);
+ irqd->chip_data = NULL;
status = hv_unmap_msi_interrupt(dev, &old_entry);
- if (status != HV_STATUS_SUCCESS) {
+ if (status != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
- return;
- }
}
-static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void hv_msi_free_irq(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
{
- int i;
- struct msi_desc *entry;
- struct pci_dev *pdev;
+ struct irq_data *irqd = irq_get_irq_data(virq);
+ struct msi_desc *desc;
- if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ if (!irqd)
return;
- pdev = to_pci_dev(dev);
+ desc = irq_data_get_msi_desc(irqd);
+ if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+ return;
- for_each_pci_msi_entry(entry, pdev) {
- if (entry->irq) {
- for (i = 0; i < entry->nvec_used; i++) {
- hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
- irq_domain_free_irqs(entry->irq + i, 1);
- }
- }
- }
+ hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
}
/*
@@ -329,7 +308,7 @@ static struct irq_chip hv_pci_msi_controller = {
};
static struct msi_domain_ops pci_msi_domain_ops = {
- .domain_free_irqs = hv_msi_domain_free_irqs,
+ .msi_free = hv_msi_free_irq,
.msi_prepare = pci_msi_prepare,
};
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
new file mode 100644
index 000000000000..1dbcbd9da74d
--- /dev/null
+++ b/arch/x86/hyperv/ivm.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V Isolation VM interface with paravisor and hypervisor
+ *
+ * Author:
+ * Tianyu Lan <Tianyu.Lan@microsoft.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/hyperv.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/svm.h>
+#include <asm/sev.h>
+#include <asm/io.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+#define GHCB_USAGE_HYPERV_CALL 1
+
+union hv_ghcb {
+ struct ghcb ghcb;
+ struct {
+ u64 hypercalldata[509];
+ u64 outputgpa;
+ union {
+ union {
+ struct {
+ u32 callcode : 16;
+ u32 isfast : 1;
+ u32 reserved1 : 14;
+ u32 isnested : 1;
+ u32 countofelements : 12;
+ u32 reserved2 : 4;
+ u32 repstartindex : 12;
+ u32 reserved3 : 4;
+ };
+ u64 asuint64;
+ } hypercallinput;
+ union {
+ struct {
+ u16 callstatus;
+ u16 reserved1;
+ u32 elementsprocessed : 12;
+ u32 reserved2 : 20;
+ };
+ u64 asunit64;
+ } hypercalloutput;
+ };
+ u64 reserved2;
+ } hypercall;
+} __packed __aligned(HV_HYP_PAGE_SIZE);
+
+static u16 hv_ghcb_version __ro_after_init;
+
+u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+ u64 status;
+
+ if (!hv_ghcb_pg)
+ return -EFAULT;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return -EFAULT;
+ }
+
+ hv_ghcb->ghcb.protocol_version = GHCB_PROTOCOL_MAX;
+ hv_ghcb->ghcb.ghcb_usage = GHCB_USAGE_HYPERV_CALL;
+
+ hv_ghcb->hypercall.outputgpa = (u64)output;
+ hv_ghcb->hypercall.hypercallinput.asuint64 = 0;
+ hv_ghcb->hypercall.hypercallinput.callcode = control;
+
+ if (input_size)
+ memcpy(hv_ghcb->hypercall.hypercalldata, input, input_size);
+
+ VMGEXIT();
+
+ hv_ghcb->ghcb.ghcb_usage = 0xffffffff;
+ memset(hv_ghcb->ghcb.save.valid_bitmap, 0,
+ sizeof(hv_ghcb->ghcb.save.valid_bitmap));
+
+ status = hv_ghcb->hypercall.hypercalloutput.callstatus;
+
+ local_irq_restore(flags);
+
+ return status;
+}
+
+static inline u64 rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static inline void wr_ghcb_msr(u64 val)
+{
+ native_wrmsrl(MSR_AMD64_SEV_ES_GHCB, val);
+}
+
+static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = hv_ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ VMGEXIT();
+
+ if (ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0))
+ return ES_VMM_ERROR;
+ else
+ return ES_OK;
+}
+
+void hv_ghcb_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypvervisor */
+ wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+bool hv_ghcb_negotiate_protocol(void)
+{
+ u64 ghcb_gpa;
+ u64 val;
+
+ /* Save ghcb page gpa. */
+ ghcb_gpa = rd_ghcb_msr();
+
+ /* Do the GHCB protocol version negotiation */
+ wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ hv_ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val),
+ GHCB_PROTOCOL_MAX);
+
+ /* Write ghcb page back after negotiating protocol. */
+ wr_ghcb_msr(ghcb_gpa);
+ VMGEXIT();
+
+ return true;
+}
+
+void hv_ghcb_msr_write(u64 msr, u64 value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value));
+ ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value));
+
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 1, 0))
+ pr_warn("Fail to write msr via ghcb %llx.\n", msr);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
+
+void hv_ghcb_msr_read(u64 msr, u64 *value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+
+ /* Check size of union hv_ghcb here. */
+ BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE);
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 0, 0))
+ pr_warn("Fail to read msr via ghcb %llx.\n", msr);
+ else
+ *value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax)
+ | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
+#endif
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+/*
+ * hv_is_isolation_supported - Check system runs in the Hyper-V
+ * isolation VM.
+ */
+bool hv_is_isolation_supported(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ return false;
+
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+
+/*
+ * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * isolation VM.
+ */
+bool hv_isolation_type_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_snp);
+}
+
+/*
+ * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host.
+ */
+static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
+ enum hv_mem_host_visibility visibility)
+{
+ struct hv_gpa_range_for_visibility **input_pcpu, *input;
+ u16 pages_processed;
+ u64 hv_status;
+ unsigned long flags;
+
+ /* no-op if partition isolation is not enabled */
+ if (!hv_is_isolation_supported())
+ return 0;
+
+ if (count > HV_MAX_MODIFY_GPA_REP_COUNT) {
+ pr_err("Hyper-V: GPA count:%d exceeds supported:%lu\n", count,
+ HV_MAX_MODIFY_GPA_REP_COUNT);
+ return -EINVAL;
+ }
+
+ local_irq_save(flags);
+ input_pcpu = (struct hv_gpa_range_for_visibility **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
+ input = *input_pcpu;
+ if (unlikely(!input)) {
+ local_irq_restore(flags);
+ return -EINVAL;
+ }
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = visibility;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn));
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count,
+ 0, input, &pages_processed);
+ local_irq_restore(flags);
+
+ if (hv_result_success(hv_status))
+ return 0;
+ else
+ return -EFAULT;
+}
+
+/*
+ * hv_set_mem_host_visibility - Set specified memory visible to host.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host. This function works as wrap of hv_mark_gpa_visibility()
+ * with memory base and size.
+ */
+int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible)
+{
+ enum hv_mem_host_visibility visibility = visible ?
+ VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE;
+ u64 *pfn_array;
+ int ret = 0;
+ int i, pfn;
+
+ if (!hv_is_isolation_supported() || !hv_hypercall_pg)
+ return 0;
+
+ pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ if (!pfn_array)
+ return -ENOMEM;
+
+ for (i = 0, pfn = 0; i < pagecount; i++) {
+ pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
+ pfn++;
+
+ if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
+ ret = hv_mark_gpa_visibility(pfn, pfn_array,
+ visibility);
+ if (ret)
+ goto err_free_pfn_array;
+ pfn = 0;
+ }
+ }
+
+ err_free_pfn_array:
+ kfree(pfn_array);
+ return ret;
+}
+
+/*
+ * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
+ */
+void *hv_map_memory(void *addr, unsigned long size)
+{
+ unsigned long *pfns = kcalloc(size / PAGE_SIZE,
+ sizeof(unsigned long), GFP_KERNEL);
+ void *vaddr;
+ int i;
+
+ if (!pfns)
+ return NULL;
+
+ for (i = 0; i < size / PAGE_SIZE; i++)
+ pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
+ (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
+
+ vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
+ kfree(pfns);
+
+ return vaddr;
+}
+
+void hv_unmap_memory(void *addr)
+{
+ vunmap(addr);
+}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 2c87350c1fb0..0ad2378fe6ad 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -52,31 +52,22 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}
-static void hyperv_flush_tlb_others(const struct cpumask *cpus,
- const struct flush_tlb_info *info)
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
struct hv_tlb_flush **flush_pcpu;
struct hv_tlb_flush *flush;
- u64 status = U64_MAX;
+ u64 status;
unsigned long flags;
- trace_hyperv_mmu_flush_tlb_others(cpus, info);
+ trace_hyperv_mmu_flush_tlb_multi(cpus, info);
if (!hv_hypercall_pg)
goto do_native;
local_irq_save(flags);
- /*
- * Only check the mask _after_ interrupt has been disabled to avoid the
- * mask changing under our feet.
- */
- if (cpumask_empty(cpus)) {
- local_irq_restore(flags);
- return;
- }
-
flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -115,7 +106,9 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
* must. We will also check all VP numbers when walking the
* supplied CPU set to remain correct in all cases.
*/
- if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64)
+ cpu = cpumask_last(cpus);
+
+ if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64)
goto do_ex_hypercall;
for_each_cpu(cpu, cpus) {
@@ -131,6 +124,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
__set_bit(vcpu, (unsigned long *)
&flush->processor_mask);
}
+
+ /* nothing to flush if 'processor_mask' ends up being empty */
+ if (!flush->processor_mask) {
+ local_irq_restore(flags);
+ return;
+ }
}
/*
@@ -161,10 +160,10 @@ do_ex_hypercall:
check_status:
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
return;
do_native:
- native_flush_tlb_others(cpus, info);
+ native_flush_tlb_multi(cpus, info);
}
static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
@@ -176,7 +175,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
u64 status;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- return U64_MAX;
+ return HV_STATUS_INVALID_PARAMETER;
flush_pcpu = (struct hv_tlb_flush_ex **)
this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -201,7 +200,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
if (nr_bank < 0)
- return U64_MAX;
+ return HV_STATUS_INVALID_PARAMETER;
/*
* We can flush not more than max_gvas with one hypercall. Flush the
@@ -239,6 +238,6 @@ void hyperv_setup_mmu_ops(void)
return;
pr_info("Using hypercall for remote TLB flush\n");
- pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+ pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index dd0a843f766d..5d70968c8538 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -47,7 +47,7 @@ int hyperv_flush_guest_mapping(u64 as)
flush, NULL);
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
ret = 0;
fault:
@@ -92,7 +92,7 @@ int hyperv_flush_guest_mapping_range(u64 as,
{
struct hv_guest_mapping_flush_list **flush_pcpu;
struct hv_guest_mapping_flush_list *flush;
- u64 status = 0;
+ u64 status;
unsigned long flags;
int ret = -ENOTSUPP;
int gpa_n = 0;
@@ -125,10 +125,10 @@ int hyperv_flush_guest_mapping_range(u64 as,
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
ret = 0;
else
- ret = status;
+ ret = hv_result(status);
fault:
trace_hyperv_nested_flush_guest_mapping_range(as, ret);
return ret;
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 8e4d0391ff6c..e481056698de 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -5,7 +5,5 @@
obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o
-obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
-
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 6efe6cb3768a..59e19549e759 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/audit_arch.h>
#include <asm/unistd_32.h>
#include <asm/audit.h>
@@ -31,15 +32,17 @@ int ia32_classify_syscall(unsigned syscall)
{
switch (syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
case __NR_execveat:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
deleted file mode 100644
index a09fc37ead9d..000000000000
--- a/arch/x86/ia32/ia32_aout.c
+++ /dev/null
@@ -1,327 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * a.out loader for x86-64
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- * Hacked together by Andi Kleen
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/jiffies.h>
-#include <linux/perf_event.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/user32.h>
-#include <asm/ia32.h>
-
-#undef WARN_OLD
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file *);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-};
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end <= start)
- return 0;
- return vm_brk(start, end - start);
-}
-
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
-{
- u32 __user *argv, *envp, *sp;
- int argc = bprm->argc, envc = bprm->envc;
-
- sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
- sp -= envc+1;
- envp = sp;
- sp -= argc+1;
- argv = sp;
- put_user((unsigned long) envp, --sp);
- put_user((unsigned long) argv, --sp);
- put_user(argc, --sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, argv++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, envp++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-static int load_aout_binary(struct linux_binprm *bprm)
-{
- unsigned long error, fd_offset, rlim;
- struct pt_regs *regs = current_pt_regs();
- struct exec ex;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(file_inode(bprm->file)) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = rlimit(RLIMIT_DATA);
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = begin_new_exec(bprm);
- if (retval)
- return retval;
-
- /* OK, This is the point of no return */
- set_personality(PER_LINUX);
- set_personality_ia32(false);
-
- setup_new_exec(bprm);
-
- regs->cs = __USER32_CS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
- regs->r13 = regs->r14 = regs->r15 = 0;
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
-
- retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0)
- return retval;
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
-
- text_addr = N_TXTADDR(ex);
- map_size = ex.a_text+ex.a_data;
-
- error = vm_brk(text_addr & PAGE_MASK, map_size);
-
- if (error)
- return error;
-
- error = read_code(bprm->file, text_addr, 32,
- ex.a_text + ex.a_data);
- if ((signed long)error < 0)
- return error;
- } else {
-#ifdef WARN_OLD
- static unsigned long error_time, error_time2;
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) &&
- time_after(jiffies, error_time2 + 5*HZ)) {
- printk(KERN_NOTICE "executable not page aligned\n");
- error_time2 = jiffies;
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 &&
- time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert "
- "program: %pD\n",
- bprm->file);
- error_time = jiffies;
- }
-#endif
-
- if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
- error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- if (error)
- return error;
-
- read_code(bprm->file, N_TXTADDR(ex), fd_offset,
- ex.a_text+ex.a_data);
- goto beyond_if;
- }
-
- error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
- fd_offset);
-
- if (error != N_TXTADDR(ex))
- return error;
-
- error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
- fd_offset + ex.a_text);
- if (error != N_DATADDR(ex))
- return error;
- }
-
-beyond_if:
- error = set_brk(current->mm->start_brk, current->mm->brk);
- if (error)
- return error;
-
- set_binfmt(&aout_format);
-
- current->mm->start_stack =
- (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
- /* start thread */
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
- load_gs_index(0);
- (regs)->ip = ex.a_entry;
- (regs)->sp = current->mm->start_stack;
- (regs)->flags = 0x200;
- (regs)->cs = __USER32_CS;
- (regs)->ss = __USER32_DS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 =
- regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- unsigned long bss, start_addr, len, error;
- int retval;
- struct exec ex;
- loff_t pos = 0;
-
- retval = -ENOEXEC;
- error = kernel_read(file, &ex, sizeof(ex), &pos);
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(file_inode(file)) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
-#ifdef WARN_OLD
- static unsigned long error_time;
- if (time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert "
- "library: %pD\n",
- file);
- error_time = jiffies;
- }
-#endif
- retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- if (retval)
- goto out;
-
- read_code(file, start_addr, N_TXTOFF(ex),
- ex.a_text + ex.a_data);
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
- N_TXTOFF(ex));
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- retval = vm_brk(start_addr + len, bss - len);
- if (retval)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- register_binfmt(&aout_format);
- return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-module_init(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 5e3d9b7fd5fb..c9c3859322fa 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,7 +24,6 @@
#include <linux/syscalls.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/ptrace.h>
#include <asm/ia32_unistd.h>
@@ -57,8 +56,8 @@ static inline void reload_segments(struct sigcontext_32 *sc)
/*
* Do a signal return; undo the signal stack.
*/
-static int ia32_restore_sigcontext(struct pt_regs *regs,
- struct sigcontext_32 __user *usc)
+static bool ia32_restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext_32 __user *usc)
{
struct sigcontext_32 sc;
@@ -66,7 +65,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
if (unlikely(copy_from_user(&sc, usc, sizeof(sc))))
- return -EFAULT;
+ return false;
/* Get only the ia32 registers. */
regs->bx = sc.bx;
@@ -111,7 +110,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->sc))
+ if (!ia32_restore_sigcontext(regs, &frame->sc))
goto badframe;
return regs->ax;
@@ -135,7 +134,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
+ if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
@@ -220,8 +219,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
- if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
- math_size) < 0)
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
+ math_size))
return (void __user *) -1L;
sp -= frame_size;
diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h
index 1b07fb102c4e..07949102a08d 100644
--- a/arch/x86/include/asm/GEN-for-each-reg.h
+++ b/arch/x86/include/asm/GEN-for-each-reg.h
@@ -1,11 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are in machine order; things rely on that.
+ */
#ifdef CONFIG_64BIT
GEN(rax)
-GEN(rbx)
GEN(rcx)
GEN(rdx)
+GEN(rbx)
+GEN(rsp)
+GEN(rbp)
GEN(rsi)
GEN(rdi)
-GEN(rbp)
GEN(r8)
GEN(r9)
GEN(r10)
@@ -16,10 +21,11 @@ GEN(r14)
GEN(r15)
#else
GEN(eax)
-GEN(ebx)
GEN(ecx)
GEN(edx)
+GEN(ebx)
+GEN(esp)
+GEN(ebp)
GEN(esi)
GEN(edi)
-GEN(ebp)
#endif
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index b19ec8282d50..1e51650b79d7 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -3,6 +3,7 @@
generated-y += syscalls_32.h
generated-y += syscalls_64.h
+generated-y += syscalls_x32.h
generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
index 9aff97f0de7f..d937c55e717e 100644
--- a/arch/x86/include/asm/acenv.h
+++ b/arch/x86/include/asm/acenv.h
@@ -13,7 +13,19 @@
/* Asm macros */
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+/*
+ * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states.
+ * It is required to prevent data loss.
+ *
+ * While running inside virtual machine, the kernel can bypass cache flushing.
+ * Changing sleep state in a virtual machine doesn't affect the host system
+ * sleep state and cannot lead to data loss.
+ */
+#define ACPI_FLUSH_CPU_CACHE() \
+do { \
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \
+ wbinvd(); \
+} while (0)
int __acpi_acquire_global_lock(unsigned int *lock);
int __acpi_release_global_lock(unsigned int *lock);
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 62da760d6d5a..cd7b14322035 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -9,7 +9,7 @@
* Functions to keep the agpgart mappings coherent with the MMU. The
* GART gives the CPU a physical alias of pages in memory. The alias
* region is mapped uncacheable. Make sure there are no conflicting
- * mappings with different cachability attributes for the same
+ * mappings with different cacheability attributes for the same
* page. This avoids data corruption on some CPUs.
*/
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
deleted file mode 100644
index 464034db299f..000000000000
--- a/arch/x86/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_ALTERNATIVE_ASM_H
-#define _ASM_X86_ALTERNATIVE_ASM_H
-
-#ifdef __ASSEMBLY__
-
-#include <asm/asm.h>
-
-#ifdef CONFIG_SMP
- .macro LOCK_PREFIX
-672: lock
- .pushsection .smp_locks,"a"
- .balign 4
- .long 672b - .
- .popsection
- .endm
-#else
- .macro LOCK_PREFIX
- .endm
-#endif
-
-/*
- * objtool annotation to ignore the alternatives and only consider the original
- * instruction(s).
- */
-.macro ANNOTATE_IGNORE_ALTERNATIVE
- .Lannotate_\@:
- .pushsection .discard.ignore_alts
- .long .Lannotate_\@ - .
- .popsection
-.endm
-
-/*
- * Issue one struct alt_instr descriptor entry (need to put it into
- * the section .altinstructions, see below). This entry contains
- * enough information for the alternatives patching code to patch an
- * instruction. See apply_alternatives().
- */
-.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
- .long \orig - .
- .long \alt - .
- .word \feature
- .byte \orig_len
- .byte \alt_len
- .byte \pad_len
-.endm
-
-/*
- * Define an alternative between two instructions. If @feature is
- * present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
- */
-.macro ALTERNATIVE oldinstr, newinstr, feature
-140:
- \oldinstr
-141:
- .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr
-144:
- .popsection
-.endm
-
-#define old_len 141b-140b
-#define new_len1 144f-143f
-#define new_len2 145f-144f
-
-/*
- * gas compatible max based on the idea from:
- * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- *
- * The additional "-" is needed because gas uses a "true" value of -1.
- */
-#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
-
-
-/*
- * Same as ALTERNATIVE macro above but for two alternatives. If CPU
- * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
- * @feature2, it replaces @oldinstr with @feature2.
- */
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
-140:
- \oldinstr
-141:
- .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_short(new_len1, new_len2) - (old_len)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr1
-144:
- \newinstr2
-145:
- .popsection
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_ALTERNATIVE_ASM_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13adca37c99a..9b10c8c76087 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -2,13 +2,17 @@
#ifndef _ASM_X86_ALTERNATIVE_H
#define _ASM_X86_ALTERNATIVE_H
-#ifndef __ASSEMBLY__
-
#include <linux/types.h>
-#include <linux/stddef.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#define ALTINSTR_FLAG_INV (1 << 15)
+#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stddef.h>
+
/*
* Alternative inline assembly for SMP.
*
@@ -61,7 +65,6 @@ struct alt_instr {
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction */
- u8 padlen; /* length of build-time padding */
} __packed;
/*
@@ -72,6 +75,8 @@ extern int alternatives_patched;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern void apply_retpolines(s32 *start, s32 *end);
+extern void apply_ibt_endbr(s32 *start, s32 *end);
struct module;
@@ -100,7 +105,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alt_end_marker "663"
#define alt_slen "662b-661b"
-#define alt_pad_len alt_end_marker"b-662b"
#define alt_total_slen alt_end_marker"b-661b"
#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
@@ -147,10 +151,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
" .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
" .byte " alt_total_slen "\n" /* source len */ \
- " .byte " alt_rlen(num) "\n" /* replacement len */ \
- " .byte " alt_pad_len "\n" /* pad len */
+ " .byte " alt_rlen(num) "\n" /* replacement len */
-#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \
"# ALT: replacement " #num "\n" \
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
@@ -161,7 +164,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
+ ALTINSTR_REPLACEMENT(newinstr, 1) \
".popsection\n"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
@@ -171,10 +174,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
- ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
+ ALTINSTR_REPLACEMENT(newinstr1, 1) \
+ ALTINSTR_REPLACEMENT(newinstr2, 2) \
".popsection\n"
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature)
+
#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
OLDINSTR_3(oldinsn, 1, 2, 3) \
".pushsection .altinstructions,\"a\"\n" \
@@ -183,9 +191,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feat3, 3) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \
+ ALTINSTR_REPLACEMENT(newinsn1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, 3) \
".popsection\n"
/*
@@ -206,15 +214,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
+ asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
+
/*
* Alternative inline assembly with input.
*
* Peculiarities:
* No memory clobber here.
* Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the
- * replacement make sure to pad to the worst case length.
* Leaving an unused argument 0 to keep API compatibility.
*/
#define alternative_input(oldinstr, newinstr, feature, input...) \
@@ -271,6 +279,115 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+672: lock
+ .pushsection .smp_locks,"a"
+ .balign 4
+ .long 672b - .
+ .popsection
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
+/*
+ * objtool annotation to ignore the alternatives and only consider the original
+ * instruction(s).
+ */
+.macro ANNOTATE_IGNORE_ALTERNATIVE
+ .Lannotate_\@:
+ .pushsection .discard.ignore_alts
+ .long .Lannotate_\@ - .
+ .popsection
+.endm
+
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro altinstruction_entry orig alt feature orig_len alt_len
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+#define old_len 141b-140b
+#define new_len1 144f-143f
+#define new_len2 145f-144f
+
+/*
+ * gas compatible max based on the idea from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
+ */
+#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @feature2.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h
new file mode 100644
index 000000000000..aabdbb5ab920
--- /dev/null
+++ b/arch/x86/include/asm/amd-ibs.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * From PPR Vol 1 for AMD Family 19h Model 01h B1
+ * 55898 Rev 0.35 - Feb 5, 2021
+ */
+
+#include <asm/msr-index.h>
+
+/*
+ * IBS Hardware MSRs
+ */
+
+/* MSR 0xc0011030: IBS Fetch Control */
+union ibs_fetch_ctl {
+ __u64 val;
+ struct {
+ __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */
+ fetch_cnt:16, /* 16-31: instruction fetch count */
+ fetch_lat:16, /* 32-47: instruction fetch latency */
+ fetch_en:1, /* 48: instruction fetch enable */
+ fetch_val:1, /* 49: instruction fetch valid */
+ fetch_comp:1, /* 50: instruction fetch complete */
+ ic_miss:1, /* 51: i-cache miss */
+ phy_addr_valid:1,/* 52: physical address valid */
+ l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size
+ * (needs IbsPhyAddrValid) */
+ l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */
+ l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */
+ rand_en:1, /* 57: random tagging enable */
+ fetch_l2_miss:1,/* 58: L2 miss for sampled fetch
+ * (needs IbsFetchComp) */
+ reserved:5; /* 59-63: reserved */
+ };
+};
+
+/* MSR 0xc0011033: IBS Execution Control */
+union ibs_op_ctl {
+ __u64 val;
+ struct {
+ __u64 opmaxcnt:16, /* 0-15: periodic op max. count */
+ reserved0:1, /* 16: reserved */
+ op_en:1, /* 17: op sampling enable */
+ op_val:1, /* 18: op sample valid */
+ cnt_ctl:1, /* 19: periodic op counter control */
+ opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
+ reserved1:5, /* 27-31: reserved */
+ opcurcnt:27, /* 32-58: periodic op counter current count */
+ reserved2:5; /* 59-63: reserved */
+ };
+};
+
+/* MSR 0xc0011035: IBS Op Data 1 */
+union ibs_op_data {
+ __u64 val;
+ struct {
+ __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */
+ tag_to_ret_ctr:16, /* 15-31: op tag to retire count */
+ reserved1:2, /* 32-33: reserved */
+ op_return:1, /* 34: return op */
+ op_brn_taken:1, /* 35: taken branch op */
+ op_brn_misp:1, /* 36: mispredicted branch op */
+ op_brn_ret:1, /* 37: branch op retired */
+ op_rip_invalid:1, /* 38: RIP is invalid */
+ op_brn_fuse:1, /* 39: fused branch op */
+ op_microcode:1, /* 40: microcode op */
+ reserved2:23; /* 41-63: reserved */
+ };
+};
+
+/* MSR 0xc0011036: IBS Op Data 2 */
+union ibs_op_data2 {
+ __u64 val;
+ struct {
+ __u64 data_src:3, /* 0-2: data source */
+ reserved0:1, /* 3: reserved */
+ rmt_node:1, /* 4: destination node */
+ cache_hit_st:1, /* 5: cache hit state */
+ reserved1:57; /* 5-63: reserved */
+ };
+};
+
+/* MSR 0xc0011037: IBS Op Data 3 */
+union ibs_op_data3 {
+ __u64 val;
+ struct {
+ __u64 ld_op:1, /* 0: load op */
+ st_op:1, /* 1: store op */
+ dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */
+ dc_l2tlb_miss:1, /* 3: data cache L2TLB hit in 2M page */
+ dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */
+ dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */
+ dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */
+ dc_miss:1, /* 7: data cache miss */
+ dc_mis_acc:1, /* 8: misaligned access */
+ reserved:4, /* 9-12: reserved */
+ dc_wc_mem_acc:1, /* 13: write combining memory access */
+ dc_uc_mem_acc:1, /* 14: uncacheable memory access */
+ dc_locked_op:1, /* 15: locked operation */
+ dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */
+ dc_lin_addr_valid:1, /* 17: data cache linear address valid */
+ dc_phy_addr_valid:1, /* 18: data cache physical address valid */
+ dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */
+ l2_miss:1, /* 20: L2 cache miss */
+ sw_pf:1, /* 21: software prefetch */
+ op_mem_width:4, /* 22-25: load/store size in bytes */
+ op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */
+ dc_miss_lat:16, /* 32-47: data cache miss latency */
+ tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */
+ };
+};
+
+/* MSR 0xc001103c: IBS Fetch Control Extended */
+union ic_ibs_extd_ctl {
+ __u64 val;
+ struct {
+ __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */
+ reserved:48; /* 16-63: reserved */
+ };
+};
+
+/*
+ * IBS driver related
+ */
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
diff --git a/arch/x86/include/asm/amd_hsmp.h b/arch/x86/include/asm/amd_hsmp.h
new file mode 100644
index 000000000000..03c2ce3edaf5
--- /dev/null
+++ b/arch/x86/include/asm/amd_hsmp.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _ASM_X86_AMD_HSMP_H_
+#define _ASM_X86_AMD_HSMP_H_
+
+#include <uapi/asm/amd_hsmp.h>
+
+#if IS_ENABLED(CONFIG_AMD_HSMP)
+int hsmp_send_message(struct hsmp_message *msg);
+#else
+static inline int hsmp_send_message(struct hsmp_message *msg)
+{
+ return -ENODEV;
+}
+#endif
+#endif /*_ASM_X86_AMD_HSMP_H_*/
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 455066a06f60..ed0eaf65c437 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -16,7 +16,6 @@ extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
extern bool early_is_amd_nb(u32 value);
extern struct resource *amd_get_mmconfig_range(struct resource *res);
-extern int amd_cache_northbridges(void);
extern void amd_flush_garts(void);
extern int amd_numa_init(void);
extern int amd_get_subcaches(int);
@@ -24,7 +23,6 @@ extern int amd_set_subcaches(int, unsigned long);
extern int amd_smn_read(u16 node, u32 address, u32 *value);
extern int amd_smn_write(u16 node, u32 address, u32 value);
-extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
struct amd_l3_cache {
unsigned indices;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 412b51e059c8..bd8ae0a7010a 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(void)
extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
extern void lapic_assign_system_vectors(void);
extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
+extern void lapic_update_legacy_vectors(void);
extern void lapic_online(void);
extern void lapic_offline(void);
extern bool apic_needs_pit(void);
@@ -327,6 +328,8 @@ struct apic {
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+ /* wakeup secondary CPU using 64-bit wakeup point */
+ int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
void (*inquire_remote_apic)(int apicid);
@@ -487,6 +490,11 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
}
+#ifdef CONFIG_X86_64
+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+#endif
+
extern int default_apic_id_valid(u32 apicid);
extern int default_acpi_madt_oem_check(char *, char *);
extern void default_setup_apic_routing(void);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 5716f22f81ac..92035eb3afee 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -95,12 +95,6 @@
#define APIC_LVTTHMR 0x330
#define APIC_LVTPC 0x340
#define APIC_LVT0 0x350
-#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
-#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
-#define SET_APIC_TIMER_BASE(x) (((x) << 18))
-#define APIC_TIMER_BASE_CLKIN 0x0
-#define APIC_TIMER_BASE_TMBASE 0x1
-#define APIC_TIMER_BASE_DIV 0x2
#define APIC_LVT_TIMER_ONESHOT (0 << 17)
#define APIC_LVT_TIMER_PERIODIC (1 << 17)
#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 51e2bf27cc9b..8f80de627c60 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -17,20 +17,3 @@
extern void cmpxchg8b_emu(void);
#endif
-#ifdef CONFIG_RETPOLINE
-
-#define DECL_INDIRECT_THUNK(reg) \
- extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
-
-#define DECL_RETPOLINE(reg) \
- extern asmlinkage void __x86_retpoline_ ## reg (void);
-
-#undef GEN
-#define GEN(reg) DECL_INDIRECT_THUNK(reg)
-#include <asm/GEN-for-each-reg.h>
-
-#undef GEN
-#define GEN(reg) DECL_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
-
-#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0603c7423aca..fbcfec4dc4cc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,25 +3,28 @@
#define _ASM_X86_ASM_H
#ifdef __ASSEMBLY__
-# define __ASM_FORM(x) x
-# define __ASM_FORM_RAW(x) x
-# define __ASM_FORM_COMMA(x) x,
+# define __ASM_FORM(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__,
+# define __ASM_REGPFX %
#else
#include <linux/stringify.h>
-
-# define __ASM_FORM(x) " " __stringify(x) " "
-# define __ASM_FORM_RAW(x) __stringify(x)
-# define __ASM_FORM_COMMA(x) " " __stringify(x) ","
+# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " "
+# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__)
+# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) ","
+# define __ASM_REGPFX %%
#endif
+#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;)
+
#ifndef __x86_64__
/* 32 bit */
-# define __ASM_SEL(a,b) __ASM_FORM(a)
-# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
#else
/* 64 bit */
-# define __ASM_SEL(a,b) __ASM_FORM(b)
-# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
#endif
#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
@@ -48,6 +51,9 @@
#define _ASM_SI __ASM_REG(si)
#define _ASM_DI __ASM_REG(di)
+/* Adds a (%rip) suffix on 64 bits only; for immediate memory references */
+#define _ASM_RIP(x) __ASM_SEL_RAW(x, x (__ASM_REGPFX rip))
+
#ifndef __x86_64__
/* 32 bit */
@@ -119,28 +125,21 @@
# define CC_OUT(c) [_cc_ ## c] "=qm"
#endif
+#ifdef __KERNEL__
+
+# include <asm/extable_fixup_types.h>
+
/* Exception table entry */
#ifdef __ASSEMBLY__
-# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
.balign 4 ; \
.long (from) - . ; \
.long (to) - . ; \
- .long (handler) - . ; \
+ .long type ; \
.popsection
-# define _ASM_EXTABLE(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
-
-# define _ASM_EXTABLE_UA(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
-
-# define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy)
-
-# define _ASM_EXTABLE_FAULT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
-
# ifdef CONFIG_KPROBES
# define _ASM_NOKPROBE(entry) \
.pushsection "_kprobe_blacklist","aw" ; \
@@ -152,26 +151,51 @@
# endif
#else /* ! __ASSEMBLY__ */
-# define _EXPAND_EXTABLE_HANDLE(x) #x
-# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+
+# define DEFINE_EXTABLE_TYPE_REG \
+ ".macro extable_type_reg type:req reg:req\n" \
+ ".set .Lfound, 0\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".if (.Lfound != 1)\n" \
+ ".error \"extable_type_reg: bad register argument\"\n" \
+ ".endif\n" \
+ ".endm\n"
+
+# define UNDEFINE_EXTABLE_TYPE_REG \
+ ".purgem extable_type_reg\n"
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
- " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \
+ " .long " __stringify(type) " \n" \
" .popsection\n"
-# define _ASM_EXTABLE(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
-
-# define _ASM_EXTABLE_UA(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
-
-# define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy)
-
-# define _ASM_EXTABLE_FAULT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ DEFINE_EXTABLE_TYPE_REG \
+ "extable_type_reg reg=" __stringify(reg) ", type=" __stringify(type) " \n"\
+ UNDEFINE_EXTABLE_TYPE_REG \
+ " .popsection\n"
/* For C file, we already have NOKPROBE_SYMBOL macro */
@@ -185,4 +209,17 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif /* __ASSEMBLY__ */
+#define _ASM_EXTABLE(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT)
+
+#define _ASM_EXTABLE_UA(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
+
+#define _ASM_EXTABLE_CPY(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY)
+
+#define _ASM_EXTABLE_FAULT(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
+
+#endif /* __KERNEL__ */
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index f732741ad7c7..5e754e895767 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -269,6 +269,4 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
# include <asm/atomic64_64.h>
#endif
-#define ARCH_ATOMIC
-
#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 4819d5e5a335..35389b2af88e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -19,9 +19,9 @@
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#else
-#define mb() asm volatile("mfence":::"memory")
-#define rmb() asm volatile("lfence":::"memory")
-#define wmb() asm volatile("sfence" ::: "memory")
+#define __mb() asm volatile("mfence":::"memory")
+#define __rmb() asm volatile("lfence":::"memory")
+#define __wmb() asm volatile("sfence" ::: "memory")
#endif
/**
@@ -51,14 +51,11 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
/* Prevent speculative execution past this barrier. */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)
-#define dma_rmb() barrier()
-#define dma_wmb() barrier()
+#define __dma_rmb() barrier()
+#define __dma_wmb() barrier()
+
+#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
-#ifdef CONFIG_X86_32
-#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
-#else
-#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
-#endif
#define __smp_rmb() dma_rmb()
#define __smp_wmb() barrier()
#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 0367efdc5b7a..a288ecd230ab 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/fls64.h>
#endif
-#include <asm-generic/bitops/find.h>
-
#include <asm-generic/bitops/sched.h>
#include <asm/arch_hweight.h>
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 981fe923a59f..53e9b0620d96 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -74,6 +74,7 @@ static void sanitize_boot_params(struct boot_params *boot_params)
BOOT_PARAM_PRESERVE(hdr),
BOOT_PARAM_PRESERVE(e820_table),
BOOT_PARAM_PRESERVE(eddbuf),
+ BOOT_PARAM_PRESERVE(cc_blob_address),
};
memset(&scratch, 0, sizeof(scratch));
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 297fa12e7e27..a3ec87d198ac 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -4,21 +4,13 @@
#include <linux/stringify.h>
#include <linux/instrumentation.h>
+#include <linux/objtool.h>
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
- *
- * Since various instruction decoders/specs disagree on the encoding of
- * UD0/UD1.
*/
-
-#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
-#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
#define ASM_UD2 ".byte 0x0f, 0x0b"
-
-#define INSN_UD0 0xff0f
#define INSN_UD2 0x0b0f
-
#define LEN_UD2 2
#ifdef CONFIG_GENERIC_BUG
@@ -26,12 +18,12 @@
#ifdef CONFIG_X86_32
# define __BUG_REL(val) ".long " __stringify(val)
#else
-# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
+# define __BUG_REL(val) ".long " __stringify(val) " - ."
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
-#define _BUG_FLAGS(ins, flags) \
+#define _BUG_FLAGS(ins, flags, extra) \
do { \
asm_inline volatile("1:\t" ins "\n" \
".pushsection __bug_table,\"aw\"\n" \
@@ -40,7 +32,8 @@ do { \
"\t.word %c1" "\t# bug_entry::line\n" \
"\t.word %c2" "\t# bug_entry::flags\n" \
"\t.org 2b+%c3\n" \
- ".popsection" \
+ ".popsection\n" \
+ extra \
: : "i" (__FILE__), "i" (__LINE__), \
"i" (flags), \
"i" (sizeof(struct bug_entry))); \
@@ -48,14 +41,15 @@ do { \
#else /* !CONFIG_DEBUG_BUGVERBOSE */
-#define _BUG_FLAGS(ins, flags) \
+#define _BUG_FLAGS(ins, flags, extra) \
do { \
asm_inline volatile("1:\t" ins "\n" \
".pushsection __bug_table,\"aw\"\n" \
"2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
"\t.word %c0" "\t# bug_entry::flags\n" \
"\t.org 2b+%c1\n" \
- ".popsection" \
+ ".popsection\n" \
+ extra \
: : "i" (flags), \
"i" (sizeof(struct bug_entry))); \
} while (0)
@@ -64,7 +58,7 @@ do { \
#else
-#define _BUG_FLAGS(ins, flags) asm volatile(ins)
+#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins)
#endif /* CONFIG_GENERIC_BUG */
@@ -72,8 +66,8 @@ do { \
#define BUG() \
do { \
instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, 0); \
- unreachable(); \
+ _BUG_FLAGS(ASM_UD2, 0, ""); \
+ __builtin_unreachable(); \
} while (0)
/*
@@ -84,9 +78,9 @@ do { \
*/
#define __WARN_FLAGS(flags) \
do { \
+ __auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \
- annotate_reachable(); \
+ _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
instrumentation_end(); \
} while (0)
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 4d4ec5cbdc51..94fbe6ae7431 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -22,7 +22,7 @@ extern void __add_wrong_size(void)
/*
* Constants for operation sizes. On 32-bit, the 64-bit size it set to
* -1 because sizeof will never return -1, thereby making those switch
- * case statements guaranteeed dead code which the compiler will
+ * case statements guaranteed dead code which the compiler will
* eliminate, and allowing the "missing symbol in the default case" to
* indicate a usage error.
*/
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 0a7fe0321613..215f5a65790f 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
#define arch_cmpxchg64_local(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
(unsigned long long)(n)))
+#define arch_try_cmpxchg64(ptr, po, n) \
+ __try_cmpxchg64((ptr), (unsigned long long *)(po), \
+ (unsigned long long)(n))
#endif
static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
@@ -70,6 +73,24 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
return prev;
}
+static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+{
+ bool success;
+ u64 old = *pold;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]"
+ CC_SET(z)
+ : CC_OUT(z) (success),
+ [ptr] "+m" (*ptr),
+ "+A" (old)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32))
+ : "memory");
+
+ if (unlikely(!success))
+ *pold = old;
+ return success;
+}
+
#ifndef CONFIG_X86_CMPXCHG64
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 072e5459fe2f..250187ac8248 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
arch_cmpxchg_local((ptr), (o), (n)); \
})
+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg((ptr), (po), (n)); \
+})
+
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
new file mode 100644
index 000000000000..3d98c3a60d34
--- /dev/null
+++ b/arch/x86/include/asm/coco.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_COCO_H
+#define _ASM_X86_COCO_H
+
+#include <asm/types.h>
+
+enum cc_vendor {
+ CC_VENDOR_NONE,
+ CC_VENDOR_AMD,
+ CC_VENDOR_HYPERV,
+ CC_VENDOR_INTEL,
+};
+
+void cc_set_vendor(enum cc_vendor v);
+void cc_set_mask(u64 mask);
+
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+u64 cc_mkenc(u64 val);
+u64 cc_mkdec(u64 val);
+#else
+static inline u64 cc_mkenc(u64 val)
+{
+ return val;
+}
+
+static inline u64 cc_mkdec(u64 val)
+{
+ return val;
+}
+#endif
+
+#endif /* _ASM_X86_COCO_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index be09c7eac89f..b1221da477b7 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -12,32 +12,35 @@
#include <asm/user32.h>
#include <asm/unistd.h>
-#include <asm-generic/compat.h>
-
-#define COMPAT_USER_HZ 100
-#define COMPAT_UTS_MACHINE "i686\0\0"
+#define compat_mode_t compat_mode_t
+typedef u16 compat_mode_t;
+#define __compat_uid_t __compat_uid_t
typedef u16 __compat_uid_t;
typedef u16 __compat_gid_t;
-typedef u32 __compat_uid32_t;
-typedef u32 __compat_gid32_t;
-typedef u16 compat_mode_t;
+
+#define compat_dev_t compat_dev_t
typedef u16 compat_dev_t;
+
+#define compat_ipc_pid_t compat_ipc_pid_t
+typedef u16 compat_ipc_pid_t;
+
+#define compat_statfs compat_statfs
+
+#include <asm-generic/compat.h>
+
+#define COMPAT_UTS_MACHINE "i686\0\0"
+
typedef u16 compat_nlink_t;
-typedef u16 compat_ipc_pid_t;
-typedef u32 compat_caddr_t;
-typedef __kernel_fsid_t compat_fsid_t;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
@@ -51,29 +54,11 @@ struct compat_stat {
u32 __unused5;
};
-struct compat_flock {
- short l_type;
- short l_whence;
- compat_off_t l_start;
- compat_off_t l_len;
- compat_pid_t l_pid;
-};
-
-#define F_GETLK64 12 /* using 'struct flock64' */
-#define F_SETLK64 13
-#define F_SETLKW64 14
-
/*
- * IA32 uses 4 byte alignment for 64 bit quantities,
- * so we need to pack this structure.
+ * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the
+ * compat flock64 structure.
*/
-struct compat_flock64 {
- short l_type;
- short l_whence;
- compat_loff_t l_start;
- compat_loff_t l_len;
- compat_pid_t l_pid;
-} __attribute__((packed));
+#define __ARCH_NEED_COMPAT_FLOCK64_PACKED
struct compat_statfs {
int f_type;
@@ -90,93 +75,11 @@ struct compat_statfs {
int f_spare[4];
};
-#define COMPAT_RLIM_INFINITY 0xffffffff
-
-typedef u32 compat_old_sigset_t; /* at least 32 bits */
-
-#define _COMPAT_NSIG 64
-#define _COMPAT_NSIG_BPW 32
-
-typedef u32 compat_sigset_word;
-
-#define COMPAT_OFF_T_MAX 0x7fffffff
-
-struct compat_ipc64_perm {
- compat_key_t key;
- __compat_uid32_t uid;
- __compat_gid32_t gid;
- __compat_uid32_t cuid;
- __compat_gid32_t cgid;
- unsigned short mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned short __pad2;
- compat_ulong_t unused1;
- compat_ulong_t unused2;
-};
-
-struct compat_semid64_ds {
- struct compat_ipc64_perm sem_perm;
- compat_ulong_t sem_otime;
- compat_ulong_t sem_otime_high;
- compat_ulong_t sem_ctime;
- compat_ulong_t sem_ctime_high;
- compat_ulong_t sem_nsems;
- compat_ulong_t __unused3;
- compat_ulong_t __unused4;
-};
-
-struct compat_msqid64_ds {
- struct compat_ipc64_perm msg_perm;
- compat_ulong_t msg_stime;
- compat_ulong_t msg_stime_high;
- compat_ulong_t msg_rtime;
- compat_ulong_t msg_rtime_high;
- compat_ulong_t msg_ctime;
- compat_ulong_t msg_ctime_high;
- compat_ulong_t msg_cbytes;
- compat_ulong_t msg_qnum;
- compat_ulong_t msg_qbytes;
- compat_pid_t msg_lspid;
- compat_pid_t msg_lrpid;
- compat_ulong_t __unused4;
- compat_ulong_t __unused5;
-};
-
-struct compat_shmid64_ds {
- struct compat_ipc64_perm shm_perm;
- compat_size_t shm_segsz;
- compat_ulong_t shm_atime;
- compat_ulong_t shm_atime_high;
- compat_ulong_t shm_dtime;
- compat_ulong_t shm_dtime_high;
- compat_ulong_t shm_ctime;
- compat_ulong_t shm_ctime_high;
- compat_pid_t shm_cpid;
- compat_pid_t shm_lpid;
- compat_ulong_t shm_nattch;
- compat_ulong_t __unused4;
- compat_ulong_t __unused5;
-};
-
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- compat_uptr_t sp = task_pt_regs(current)->sp;
-
- /*
- * -128 for the x32 ABI redzone. For IA32, it is not strictly
- * necessary, but not harmful.
- */
- sp -= 128;
-
- return (void __user *)round_down(sp - len, 16);
-}
-
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..8cbf623f0ecf 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,7 @@
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/percpu.h>
+#include <asm/ibt.h>
#ifdef CONFIG_SMP
@@ -35,19 +36,21 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif
+extern void ap_init_aperfmperf(void);
+
int mwait_usable(const struct cpuinfo_x86 *);
unsigned int x86_family(unsigned int sig);
unsigned int x86_model(unsigned int sig);
unsigned int x86_stepping(unsigned int sig);
#ifdef CONFIG_CPU_SUP_INTEL
-extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
-extern void switch_to_sld(unsigned long tifn);
+extern void __init sld_setup(struct cpuinfo_x86 *c);
extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
extern bool handle_guest_split_lock(unsigned long ip);
+extern void handle_bus_lock(struct pt_regs *regs);
+u8 get_this_hybrid_cpu_type(void);
#else
-static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
-static inline void switch_to_sld(unsigned long tifn) {}
+static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
return false;
@@ -57,10 +60,38 @@ static inline bool handle_guest_split_lock(unsigned long ip)
{
return false;
}
+
+static inline void handle_bus_lock(struct pt_regs *regs) {}
+
+static inline u8 get_this_hybrid_cpu_type(void)
+{
+ return 0;
+}
#endif
#ifdef CONFIG_IA32_FEAT_CTL
void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
#else
static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {}
#endif
+
+extern __noendbr void cet_disable(void);
+
+struct ucode_cpu_info;
+
+int intel_cpu_collect_info(struct ucode_cpu_info *uci);
+
+static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
+{
+ if (s1 != s2)
+ return false;
+
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
+
+ /* ... or they intersect. */
+ return p1 & p2;
+}
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 3d52b094850a..75efc4c6f076 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -10,6 +10,12 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define VC_EXCEPTION_STKSZ EXCEPTION_STKSZ
+#else
+#define VC_EXCEPTION_STKSZ 0
+#endif
+
/* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
char DF_stack_guard[guardsize]; \
@@ -28,7 +34,7 @@
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
- ESTACKS_MEMBERS(0, 0)
+ ESTACKS_MEMBERS(0, VC_EXCEPTION_STKSZ)
};
/* The effective cpu entry area mapping with guard pages. */
@@ -137,7 +143,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
-static inline struct entry_stack *cpu_entry_stack(int cpu)
+static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
}
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1728d4ce5730..ea34cc31b047 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,6 +8,7 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+#include <asm/alternative.h>
enum cpuid_leafs
{
@@ -33,14 +34,17 @@ enum cpuid_leafs
CPUID_8000_001F_EAX,
};
+#define X86_CAP_FMT_NUM "%d:%d"
+#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31)
+
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
-#define X86_CAP_FMT "%d:%d"
-#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
+#define X86_CAP_FMT X86_CAP_FMT_NUM
+#define x86_cap_flag x86_cap_flag_num
#endif
/*
@@ -50,7 +54,7 @@ extern const char * const x86_power_flags[32];
extern const char * const x86_bug_flags[NBUGINTS*32];
#define test_cpu_cap(c, bit) \
- test_bit(bit, (unsigned long *)((c)->x86_capability))
+ arch_test_bit(bit, (unsigned long *)((c)->x86_capability))
/*
* There are 32 bits/features in each mask word. The high bits
@@ -172,44 +176,25 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
* means that the boot_cpu_has() variant is already fast enough for the
* majority of cases and you should stick to using it as it is generally
* only two instructions: a RIP-relative MOV and a TEST.
+ *
+ * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
+ * that this is only used on a fallback path and will sometimes cause
+ * it to manifest the address of boot_cpu_data in a register, fouling
+ * the mainline (post-initialization) code.
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P[always]\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P[feature]\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
+ asm_volatile_goto(
+ ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ ".pushsection .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".popsection\n"
: : [feature] "i" (bit),
- [always] "i" (X86_FEATURE_ALWAYS),
[bitnum] "i" (1 << (bit & 7)),
- [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
: : t_yes, t_no);
t_yes:
return true;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..03acc823838a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -84,7 +84,7 @@
/* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
@@ -108,7 +108,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
-/* free ( 3*32+29) */
+#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
@@ -201,17 +201,17 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-/* FREE! ( 7*32+10) */
+#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-/* FREE! ( 7*32+20) */
+#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -236,6 +236,9 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -275,6 +278,7 @@
#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
+#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -290,6 +294,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@@ -309,6 +315,8 @@
#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -321,6 +329,7 @@
#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
@@ -336,6 +345,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -354,6 +364,7 @@
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
@@ -372,12 +383,18 @@
#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
+#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */
#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */
+#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
@@ -390,6 +407,7 @@
#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
/*
@@ -428,5 +446,6 @@
#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
new file mode 100644
index 000000000000..70b2db18165e
--- /dev/null
+++ b/arch/x86/include/asm/cpuid.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * CPUID-related helpers/definitions
+ *
+ * Derived from arch/x86/kvm/cpuid.c
+ */
+
+#ifndef _ASM_X86_CPUID_H
+#define _ASM_X86_CPUID_H
+
+static __always_inline bool cpuid_function_is_indexed(u32 function)
+{
+ switch (function) {
+ case 4:
+ case 7:
+ case 0xb:
+ case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f:
+ case 0x8000001d:
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _ASM_X86_CPUID_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 3afa990d756b..c5aed9e9226c 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -20,11 +20,21 @@ static __always_inline bool arch_cpu_online(int cpu)
{
return arch_test_bit(cpu, cpumask_bits(cpu_online_mask));
}
+
+static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ arch_clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
#else
static __always_inline bool arch_cpu_online(int cpu)
{
return cpu == 0;
}
+
+static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ return;
+}
#endif
#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu))
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
index f58de66091e5..8b6bd63530dc 100644
--- a/arch/x86/include/asm/crash.h
+++ b/arch/x86/include/asm/crash.h
@@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image,
struct boot_params *params);
void crash_smp_send_stop(void);
-#ifdef CONFIG_KEXEC_CORE
-void __init crash_reserve_low_1M(void);
-#else
-static inline void __init crash_reserve_low_1M(void) { }
-#endif
-
#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 476082a83d1c..ab97b22ac04a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,7 @@
#include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>
+#include <linux/debug_locks.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@@ -224,6 +225,26 @@ static inline void store_idt(struct desc_ptr *dtr)
asm volatile("sidt %0":"=m" (*dtr));
}
+static inline void native_gdt_invalidate(void)
+{
+ const struct desc_ptr invalid_gdt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_gdt(&invalid_gdt);
+}
+
+static inline void native_idt_invalidate(void)
+{
+ const struct desc_ptr invalid_idt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_idt(&invalid_idt);
+}
+
/*
* The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
* a read-only remapping. To prevent a page fault, the GDT is switched to the
@@ -421,12 +442,10 @@ extern bool idt_is_f00f_address(unsigned long address);
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
-extern void idt_setup_ist_traps(void);
#else
static inline void idt_setup_early_pf(void) { }
-static inline void idt_setup_ist_traps(void) { }
#endif
-extern void idt_invalidate(void *addr);
+extern void idt_invalidate(void);
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index b7dd944dc867..36369e76cc63 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -10,12 +10,6 @@
* cpu_feature_enabled().
*/
-#ifdef CONFIG_X86_SMAP
-# define DISABLE_SMAP 0
-#else
-# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31))
-#endif
-
#ifdef CONFIG_X86_UMIP
# define DISABLE_UMIP 0
#else
@@ -56,10 +50,10 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_IOMMU_SUPPORT
-# define DISABLE_ENQCMD 0
+#ifdef CONFIG_INTEL_IOMMU_SVM
+# define DISABLE_ENQCMD 0
#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
+# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#endif
#ifdef CONFIG_X86_SGX
@@ -68,6 +62,12 @@
# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+# define DISABLE_TDX_GUEST 0
+#else
+# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -79,8 +79,8 @@
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
-#define DISABLED_MASK8 0
-#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX)
+#define DISABLED_MASK8 (DISABLE_TDX_GUEST)
+#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 0
#define DISABLED_MASK12 0
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index bb1654fe0ce7..1c66708e3062 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -2,18 +2,6 @@
#ifndef _ASM_X86_DMA_MAPPING_H
#define _ASM_X86_DMA_MAPPING_H
-/*
- * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and
- * Documentation/core-api/dma-api.rst for documentation.
- */
-
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <asm/swiotlb.h>
-
-extern int iommu_merge;
-extern int panic_on_overflow;
-
extern const struct dma_map_ops *dma_ops;
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d0b126835b8..71943dce691e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -7,6 +7,7 @@
#include <asm/tlb.h>
#include <asm/nospec-branch.h>
#include <asm/mmu_context.h>
+#include <asm/ibt.h>
#include <linux/build_bug.h>
#include <linux/kernel.h>
#include <linux/pgtable.h>
@@ -46,13 +47,14 @@ extern unsigned long efi_mixed_mode_stack_pa;
#define __efi_nargs(...) __efi_nargs_(__VA_ARGS__)
#define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \
+ __efi_arg_sentinel(9), __efi_arg_sentinel(8), \
__efi_arg_sentinel(7), __efi_arg_sentinel(6), \
__efi_arg_sentinel(5), __efi_arg_sentinel(4), \
__efi_arg_sentinel(3), __efi_arg_sentinel(2), \
__efi_arg_sentinel(1), __efi_arg_sentinel(0))
-#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, n, ...) \
+#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...) \
__take_second_arg(n, \
- ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 8; }))
+ ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 10; }))
#define __efi_arg_sentinel(n) , n
/*
@@ -119,8 +121,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
efi_enter_mm(); \
})
-#define arch_efi_call_virt(p, f, args...) \
- efi_call((void *)p->f, args) \
+#define arch_efi_call_virt(p, f, args...) ({ \
+ u64 ret, ibt = ibt_save(); \
+ ret = efi_call((void *)p->f, args); \
+ ibt_restore(ibt); \
+ ret; \
+})
#define arch_efi_call_virt_teardown() \
({ \
@@ -176,8 +182,9 @@ extern u64 efi_setup;
extern efi_status_t __efi64_thunk(u32, ...);
#define efi64_thunk(...) ({ \
- __efi_nargs_check(efi64_thunk, 6, __VA_ARGS__); \
- __efi64_thunk(__VA_ARGS__); \
+ u64 __pad[3]; /* must have space for 3 args on the stack */ \
+ __efi_nargs_check(efi64_thunk, 9, __VA_ARGS__); \
+ __efi64_thunk(__VA_ARGS__, __pad); \
})
static inline bool efi_is_mixed(void)
@@ -197,8 +204,6 @@ static inline bool efi_runtime_supported(void)
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
-extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-
extern void efi_thunk_runtime_setup(void);
efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -265,6 +270,8 @@ static inline u32 efi64_convert_status(efi_status_t status)
return (u32)(status | (u64)status >> 32);
}
+#define __efi64_split(val) (val) & U32_MAX, (u64)(val) >> 32
+
#define __efi64_argmap_free_pages(addr, size) \
((addr), 0, (size))
@@ -308,6 +315,17 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_query_mode(gop, mode, size, info) \
((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info))
+/* TCG2 protocol */
+#define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \
+ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev)
+
+/* DXE services */
+#define __efi64_argmap_get_memory_space_descriptor(phys, desc) \
+ (__efi64_split(phys), (desc))
+
+#define __efi64_argmap_set_memory_space_descriptor(phys, size, flags) \
+ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
@@ -348,6 +366,11 @@ static inline u32 efi64_convert_status(efi_status_t status)
runtime), \
func, __VA_ARGS__))
+#define efi_dxe_call(func, ...) \
+ (efi_is_native() \
+ ? efi_dxe_table->func(__VA_ARGS__) \
+ : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__))
+
#else /* CONFIG_EFI_MIXED */
static inline bool efi_is_64bit(void)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..cb0ff1055ab1 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -116,7 +116,7 @@ extern unsigned int vdso32_enabled;
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -128,6 +128,7 @@ do { \
pr_reg[7] = regs->ds; \
pr_reg[8] = regs->es; \
pr_reg[9] = regs->fs; \
+ savesegment(gs, pr_reg[10]); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
pr_reg[13] = regs->cs; \
@@ -136,18 +137,6 @@ do { \
pr_reg[16] = regs->ss; \
} while (0);
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- pr_reg[10] = get_user_gs(regs); \
-} while (0);
-
-#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- savesegment(gs, pr_reg[10]); \
-} while (0);
-
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
@@ -283,12 +272,12 @@ extern u32 elf_hwcap2;
*
* The decision process for determining the results are:
*
- *              CPU: | lacks NX*  | has NX, ia32     | has NX, x86_64 |
- * ELF:              |            |                  |                |
+ * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 |
+ * ELF: | | | |
* ---------------------|------------|------------------|----------------|
- * missing PT_GNU_STACK | exec-all   | exec-all         | exec-none      |
- * PT_GNU_STACK == RWX  | exec-stack | exec-stack       | exec-stack     |
- * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      |
+ * missing PT_GNU_STACK | exec-all | exec-all | exec-none |
+ * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack |
+ * PT_GNU_STACK == RW | exec-none | exec-none | exec-none |
*
* exec-all : all PROT_READ user mappings are executable, except when
* backed by files on a noexec-filesystem.
@@ -312,6 +301,7 @@ do { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
/*
@@ -328,6 +318,7 @@ extern unsigned long task_size_32bit(void);
extern unsigned long task_size_64bit(int full_addr_space);
extern unsigned long get_mmap_base(int is_legacy);
extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
#ifdef CONFIG_X86_32
@@ -349,6 +340,7 @@ do { \
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
(unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -357,6 +349,7 @@ do { \
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
(unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
#define AT_SYSINFO 32
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 2b87b191b3b8..674ed46d3ced 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_ENTRY_COMMON_H
#define _ASM_X86_ENTRY_COMMON_H
+#include <linux/randomize_kstack.h>
#include <linux/user-return-notifier.h>
#include <asm/nospec-branch.h>
@@ -9,7 +10,7 @@
#include <asm/fpu/api.h>
/* Check that the stack and regs on entry from user mode are sane. */
-static __always_inline void arch_check_user_regs(struct pt_regs *regs)
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
/*
@@ -24,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
* For !SMAP hardware we patch out CLAC on entry.
*/
if (boot_cpu_has(X86_FEATURE_SMAP) ||
- (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+ (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
mask |= X86_EFLAGS_AC;
WARN_ON_ONCE(flags & mask);
@@ -41,7 +42,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
WARN_ON_ONCE(regs != task_pt_regs(current));
}
}
-#define arch_check_user_regs arch_check_user_regs
+#define arch_enter_from_user_mode arch_enter_from_user_mode
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
unsigned long ti_work)
@@ -70,6 +71,21 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
*/
current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED);
#endif
+
+ /*
+ * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
+ * but not enough for x86 stack utilization comfort. To keep
+ * reasonable stack head room, reduce the maximum offset to 8 bits.
+ *
+ * The actual entropy will be further reduced by the compiler when
+ * applying stack alignment constraints (see cc_stack_align4/8 in
+ * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32)
+ * low bits from any entropy chosen here.
+ *
+ * Therefore, final stack offset entropy will be 5 (x86_64) or
+ * 6 (ia32) bits.
+ */
+ choose_random_kstack_offset(rdtsc() & 0xFF);
}
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index 1f0cbc52937c..eeed395c3177 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -1,12 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_EXTABLE_H
#define _ASM_X86_EXTABLE_H
+
+#include <asm/extable_fixup_types.h>
+
/*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
+ * The exception table consists of two addresses relative to the
+ * exception table entry itself and a type selector field.
+ *
+ * The first address is of an instruction that is allowed to fault, the
+ * second is the target at which the program should continue.
+ *
+ * The type entry is used by fixup_exception() to select the handler to
+ * deal with the fault caused by the instruction in the first field.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -15,7 +21,7 @@
*/
struct exception_table_entry {
- int insn, fixup, handler;
+ int insn, fixup, data;
};
struct pt_regs;
@@ -25,21 +31,31 @@ struct pt_regs;
do { \
(a)->fixup = (b)->fixup + (delta); \
(b)->fixup = (tmp).fixup - (delta); \
- (a)->handler = (b)->handler + (delta); \
- (b)->handler = (tmp).handler - (delta); \
+ (a)->data = (b)->data; \
+ (b)->data = (tmp).data; \
} while (0)
-enum handler_type {
- EX_HANDLER_NONE,
- EX_HANDLER_FAULT,
- EX_HANDLER_UACCESS,
- EX_HANDLER_OTHER
-};
-
extern int fixup_exception(struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr);
extern int fixup_bug(struct pt_regs *regs, int trapnr);
-extern enum handler_type ex_get_fault_handler_type(unsigned long ip);
+extern int ex_get_fixup_type(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+#ifdef CONFIG_X86_MCE
+extern void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr);
+#else
+static inline void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
+{
+ for (;;)
+ cpu_relax();
+}
+#endif
+
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64)
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs);
+#else
+static inline bool ex_handler_bpf(const struct exception_table_entry *x,
+ struct pt_regs *regs) { return false; }
+#endif
+
#endif
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
new file mode 100644
index 000000000000..503622627400
--- /dev/null
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H
+#define _ASM_X86_EXTABLE_FIXUP_TYPES_H
+
+/*
+ * Our IMM is signed, as such it must live at the top end of the word. Also,
+ * since C99 hex constants are of ambigious type, force cast the mask to 'int'
+ * so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
+ */
+#define EX_DATA_TYPE_MASK ((int)0x000000FF)
+#define EX_DATA_REG_MASK ((int)0x00000F00)
+#define EX_DATA_FLAG_MASK ((int)0x0000F000)
+#define EX_DATA_IMM_MASK ((int)0xFFFF0000)
+
+#define EX_DATA_REG_SHIFT 8
+#define EX_DATA_FLAG_SHIFT 12
+#define EX_DATA_IMM_SHIFT 16
+
+#define EX_DATA_REG(reg) ((reg) << EX_DATA_REG_SHIFT)
+#define EX_DATA_FLAG(flag) ((flag) << EX_DATA_FLAG_SHIFT)
+#define EX_DATA_IMM(imm) ((imm) << EX_DATA_IMM_SHIFT)
+
+/* segment regs */
+#define EX_REG_DS EX_DATA_REG(8)
+#define EX_REG_ES EX_DATA_REG(9)
+#define EX_REG_FS EX_DATA_REG(10)
+#define EX_REG_GS EX_DATA_REG(11)
+
+/* flags */
+#define EX_FLAG_CLEAR_AX EX_DATA_FLAG(1)
+#define EX_FLAG_CLEAR_DX EX_DATA_FLAG(2)
+#define EX_FLAG_CLEAR_AX_DX EX_DATA_FLAG(3)
+
+/* types */
+#define EX_TYPE_NONE 0
+#define EX_TYPE_DEFAULT 1
+#define EX_TYPE_FAULT 2
+#define EX_TYPE_UACCESS 3
+#define EX_TYPE_COPY 4
+#define EX_TYPE_CLEAR_FS 5
+#define EX_TYPE_FPU_RESTORE 6
+#define EX_TYPE_BPF 7
+#define EX_TYPE_WRMSR 8
+#define EX_TYPE_RDMSR 9
+#define EX_TYPE_WRMSR_SAFE 10 /* reg := -EIO */
+#define EX_TYPE_RDMSR_SAFE 11 /* reg := -EIO */
+#define EX_TYPE_WRMSR_IN_MCE 12
+#define EX_TYPE_RDMSR_IN_MCE 13
+#define EX_TYPE_DEFAULT_MCE_SAFE 14
+#define EX_TYPE_FAULT_MCE_SAFE 15
+
+#define EX_TYPE_POP_REG 16 /* sp += sizeof(long) */
+#define EX_TYPE_POP_ZERO (EX_TYPE_POP_REG | EX_DATA_IMM(0))
+
+#define EX_TYPE_IMM_REG 17 /* reg := (long)imm */
+#define EX_TYPE_EFAULT_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT))
+#define EX_TYPE_ZERO_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(0))
+#define EX_TYPE_ONE_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(1))
+
+#define EX_TYPE_FAULT_SGX 18
+
+#define EX_TYPE_UCOPY_LEN 19 /* cx := reg + imm*cx */
+#define EX_TYPE_UCOPY_LEN1 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(1))
+#define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
+#define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
+
+#endif
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index d43717b423cb..6ec3fc969ad5 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -74,7 +74,6 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
int lcount;
char *lptr;
- st = 1;
for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
lcount; lcount--, lptr++) {
st = inb(virtual_dma_port + FD_STATUS);
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index ed33a14188f6..6b0f31fb53f7 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -12,6 +12,8 @@
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>
+#include <asm/fpu/types.h>
+
/*
* Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
* disables preemption so be careful if you intend to use it for long periods
@@ -48,9 +50,9 @@ static inline void kernel_fpu_begin(void)
}
/*
- * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
* A context switch will (and softirq might) save CPU's FPU registers to
- * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
+ * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
* a random state.
*
* local_bh_disable() protects against both preemption and soft interrupts
@@ -100,16 +102,66 @@ extern void switch_fpu_return(void);
*/
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
-/*
- * Tasks that are not using SVA have mm->pasid set to zero to note that they
- * will not have the valid bit set in MSR_IA32_PASID while they are running.
- */
-#define PASID_DISABLED 0
+/* Trap handling */
+extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern void fpu_sync_fpstate(struct fpu *fpu);
+extern void fpu_reset_from_exception_fixup(void);
+
+/* Boot, hotplug and resume */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
-#ifdef CONFIG_IOMMU_SUPPORT
-/* Update current's PASID MSR/state by mm's PASID. */
-void update_pasid(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
#else
-static inline void update_pasid(void) { }
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif
+
+/* State tracking */
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/* Process cleanup */
+#ifdef CONFIG_X86_64
+extern void fpstate_free(struct fpu *fpu);
+#else
+static inline void fpstate_free(struct fpu *fpu) { }
+#endif
+
+/* fpstate-related functions which are exported to KVM */
+extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature);
+
+extern u64 xstate_get_guest_group_perm(void);
+
+/* KVM specific functions */
+extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
+extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
+extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
+extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);
+
+#ifdef CONFIG_X86_64
+extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
+extern void fpu_sync_guest_vmexit_xfd_state(void);
+#else
+static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
+static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
+#endif
+
+extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru);
+extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);
+
+static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
+{
+ gfpu->fpstate->is_confidential = true;
+}
+
+static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
+{
+ return gfpu->fpstate->is_confidential;
+}
+
+/* prctl */
+extern long fpu_xstate_prctl(int option, unsigned long arg2);
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
deleted file mode 100644
index 8d33ad80704f..000000000000
--- a/arch/x86/include/asm/fpu/internal.h
+++ /dev/null
@@ -1,596 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- * Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_FPU_INTERNAL_H
-#define _ASM_X86_FPU_INTERNAL_H
-
-#include <linux/compat.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-
-#include <asm/user.h>
-#include <asm/fpu/api.h>
-#include <asm/fpu/xstate.h>
-#include <asm/fpu/xcr.h>
-#include <asm/cpufeature.h>
-#include <asm/trace/fpu.h>
-
-/*
- * High level FPU state handling functions:
- */
-extern void fpu__prepare_read(struct fpu *fpu);
-extern void fpu__prepare_write(struct fpu *fpu);
-extern void fpu__save(struct fpu *fpu);
-extern int fpu__restore_sig(void __user *buf, int ia32_frame);
-extern void fpu__drop(struct fpu *fpu);
-extern int fpu__copy(struct task_struct *dst, struct task_struct *src);
-extern void fpu__clear_user_states(struct fpu *fpu);
-extern void fpu__clear_all(struct fpu *fpu);
-extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
-
-/*
- * Boot time FPU initialization functions:
- */
-extern void fpu__init_cpu(void);
-extern void fpu__init_system_xstate(void);
-extern void fpu__init_cpu_xstate(void);
-extern void fpu__init_system(struct cpuinfo_x86 *c);
-extern void fpu__init_check_bugs(void);
-extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);
-
-/*
- * Debugging facility:
- */
-#ifdef CONFIG_X86_DEBUG_FPU
-# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
-#else
-# define WARN_ON_FPU(x) ({ (void)(x); 0; })
-#endif
-
-/*
- * FPU related CPU feature flag helper routines:
- */
-static __always_inline __pure bool use_xsaveopt(void)
-{
- return static_cpu_has(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
- return static_cpu_has(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
- return static_cpu_has(X86_FEATURE_FXSR);
-}
-
-/*
- * fpstate handling functions:
- */
-
-extern union fpregs_state init_fpstate;
-
-extern void fpstate_init(union fpregs_state *state);
-#ifdef CONFIG_MATH_EMULATION
-extern void fpstate_init_soft(struct swregs_state *soft);
-#else
-static inline void fpstate_init_soft(struct swregs_state *soft) {}
-#endif
-
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
-{
- /*
- * XRSTORS requires these bits set in xcomp_bv, or it will
- * trigger #GP:
- */
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
-}
-
-static inline void fpstate_init_fxstate(struct fxregs_state *fx)
-{
- fx->cwd = 0x37f;
- fx->mxcsr = MXCSR_DEFAULT;
-}
-extern void fpstate_sanitize_xstate(struct fpu *fpu);
-
-#define user_insn(insn, output, input...) \
-({ \
- int err; \
- \
- might_fault(); \
- \
- asm volatile(ASM_STAC "\n" \
- "1:" #insn "\n\t" \
- "2: " ASM_CLAC "\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
- : "0"(0), input); \
- err; \
-})
-
-#define kernel_insn_err(insn, output, input...) \
-({ \
- int err; \
- asm volatile("1:" #insn "\n\t" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
- : "0"(0), input); \
- err; \
-})
-
-#define kernel_insn(insn, output, input...) \
- asm volatile("1:" #insn "\n\t" \
- "2:\n" \
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
- : output : input)
-
-static inline int copy_fregs_to_user(struct fregs_state __user *fx)
-{
- return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
-}
-
-static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
- else
- return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
-
-}
-
-static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void copy_kernel_to_fregs(struct fregs_state *fx)
-{
- kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
-{
- return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int copy_user_to_fregs(struct fregs_state __user *fx)
-{
- return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void copy_fxregs_to_kernel(struct fpu *fpu)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
- else
- asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
-}
-
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-
-#define XSTATE_OP(op, st, lmask, hmask, err) \
- asm volatile("1:" op "\n\t" \
- "xor %[err], %[err]\n" \
- "2:\n\t" \
- ".pushsection .fixup,\"ax\"\n\t" \
- "3: movl $-2,%[err]\n\t" \
- "jmp 2b\n\t" \
- ".popsection\n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
- * XSAVEOPT.
- *
- * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
- * supports modified optimization which is not supported by XSAVE.
- *
- * We use XSAVE as a fallback.
- *
- * The 661 label is defined in the ALTERNATIVE* macros as the address of the
- * original instruction which gets replaced. We need to use it here as the
- * address of the instruction where we might get an exception at.
- */
-#define XSTATE_XSAVE(st, lmask, hmask, err) \
- asm volatile(ALTERNATIVE_2(XSAVE, \
- XSAVEOPT, X86_FEATURE_XSAVEOPT, \
- XSAVES, X86_FEATURE_XSAVES) \
- "\n" \
- "xor %[err], %[err]\n" \
- "3:\n" \
- ".pushsection .fixup,\"ax\"\n" \
- "4: movl $-2, %[err]\n" \
- "jmp 3b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(661b, 4b) \
- : [err] "=r" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
- * XSAVE area format.
- */
-#define XSTATE_XRESTORE(st, lmask, hmask) \
- asm volatile(ALTERNATIVE(XRSTOR, \
- XRSTORS, X86_FEATURE_XSAVES) \
- "\n" \
- "3:\n" \
- _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
- : \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
-{
- u64 mask = -1;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
-
- /*
- * We should never fault when copying from a kernel buffer, and the FPU
- * state we set at boot time should be valid.
- */
- WARN_ON_FPU(err);
-}
-
-/*
- * Save processor xstate to xsave area.
- */
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON_FPU(!alternatives_patched);
-
- XSTATE_XSAVE(xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
-/*
- * Restore processor xstate from xsave area.
- */
-static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- XSTATE_XRESTORE(xstate, lmask, hmask);
-}
-
-/*
- * Save xstate to user space xsave area.
- *
- * We don't use modified optimization because xrstor/xrstors might track
- * a different application.
- *
- * We don't use compacted format xsave area for
- * backward compatibility for old applications which don't understand
- * compacted format of xsave area.
- */
-static inline int copy_xregs_to_user(struct xregs_state __user *buf)
-{
- u64 mask = xfeatures_mask_user();
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- /*
- * Clear the xsave header first, so that reserved fields are
- * initialized to zero.
- */
- err = __clear_user(&buf->header, sizeof(buf->header));
- if (unlikely(err))
- return -EFAULT;
-
- stac();
- XSTATE_OP(XSAVE, buf, lmask, hmask, err);
- clac();
-
- return err;
-}
-
-/*
- * Restore xstate from user space xsave area.
- */
-static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
-{
- struct xregs_state *xstate = ((__force struct xregs_state *)buf);
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- stac();
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
- clac();
-
- return err;
-}
-
-/*
- * Restore xstate from kernel space xsave area, return an error code instead of
- * an exception.
- */
-static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- if (static_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
-
- return err;
-}
-
-extern int copy_fpregs_to_fpstate(struct fpu *fpu);
-
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
-{
- if (use_xsave()) {
- copy_kernel_to_xregs(&fpstate->xsave, mask);
- } else {
- if (use_fxsr())
- copy_kernel_to_fxregs(&fpstate->fxsave);
- else
- copy_kernel_to_fregs(&fpstate->fsave);
- }
-}
-
-static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
-{
- /*
- * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
- * pending. Clear the x87 state here by setting it to fixed values.
- * "m" is a random variable that should be in L1.
- */
- if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
- asm volatile(
- "fnclex\n\t"
- "emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
- }
-
- __copy_kernel_to_fpregs(fpstate, -1);
-}
-
-extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
-
-/*
- * FPU context switch related helper methods:
- */
-
-DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-
-/*
- * The in-register FPU state for an FPU context on a CPU is assumed to be
- * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
- * matches the FPU.
- *
- * If the FPU register state is valid, the kernel can skip restoring the
- * FPU state from memory.
- *
- * Any code that clobbers the FPU registers or updates the in-memory
- * FPU state for a task MUST let the rest of the kernel know that the
- * FPU registers are no longer valid for this task.
- *
- * Either one of these invalidation functions is enough. Invalidate
- * a resource you control: CPU if using the CPU for something else
- * (with preemption disabled), FPU for the current task, or a task that
- * is prevented from running by the current task.
- */
-static inline void __cpu_invalidate_fpregs_state(void)
-{
- __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
-}
-
-static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
-{
- fpu->last_cpu = -1;
-}
-
-static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
-{
- return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
-}
-
-/*
- * These generally need preemption protection to work,
- * do try to avoid using these on their own:
- */
-static inline void fpregs_deactivate(struct fpu *fpu)
-{
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- trace_x86_fpu_regs_deactivated(fpu);
-}
-
-static inline void fpregs_activate(struct fpu *fpu)
-{
- this_cpu_write(fpu_fpregs_owner_ctx, fpu);
- trace_x86_fpu_regs_activated(fpu);
-}
-
-/*
- * Internal helper, do not use directly. Use switch_fpu_return() instead.
- */
-static inline void __fpregs_load_activate(void)
-{
- struct fpu *fpu = &current->thread.fpu;
- int cpu = smp_processor_id();
-
- if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
- return;
-
- if (!fpregs_state_valid(fpu, cpu)) {
- copy_kernel_to_fpregs(&fpu->state);
- fpregs_activate(fpu);
- fpu->last_cpu = cpu;
- }
- clear_thread_flag(TIF_NEED_FPU_LOAD);
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- * - switch_fpu_prepare() saves the old state.
- * This is done within the context of the old process.
- *
- * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
- * will get loaded on return to userspace, or when the kernel needs it.
- *
- * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
- * are saved in the current thread's FPU register state.
- *
- * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
- * hold current()'s FPU registers. It is required to load the
- * registers before returning to userland or using the content
- * otherwise.
- *
- * The FPU context is only stored/restored for a user task and
- * PF_KTHREAD is used to distinguish between kernel and user threads.
- */
-static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
-{
- if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
- if (!copy_fpregs_to_fpstate(old_fpu))
- old_fpu->last_cpu = -1;
- else
- old_fpu->last_cpu = cpu;
-
- /* But leave fpu_fpregs_owner_ctx! */
- trace_x86_fpu_regs_deactivated(old_fpu);
- }
-}
-
-/*
- * Misc helper functions:
- */
-
-/*
- * Load PKRU from the FPU context if available. Delay loading of the
- * complete FPU state until the return to userland.
- */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
-{
- u32 pkru_val = init_pkru_value;
- struct pkru_state *pk;
-
- if (!static_cpu_has(X86_FEATURE_FPU))
- return;
-
- set_thread_flag(TIF_NEED_FPU_LOAD);
-
- if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
- return;
-
- /*
- * PKRU state is switched eagerly because it needs to be valid before we
- * return to userland e.g. for a copy_to_user() operation.
- */
- if (current->mm) {
- pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
- if (pk)
- pkru_val = pk->pkru;
- }
- __write_pkru(pkru_val);
-
- /*
- * Expensive PASID MSR write will be avoided in update_pasid() because
- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
- * unless it's different from mm->pasid to reduce overhead.
- */
- update_pasid();
-}
-
-#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
new file mode 100644
index 000000000000..b2486b2cbc6e
--- /dev/null
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FPU_SCHED_H
+#define _ASM_X86_FPU_SCHED_H
+
+#include <linux/sched.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/types.h>
+
+#include <asm/trace/fpu.h>
+
+extern void save_fpregs_to_fpstate(struct fpu *fpu);
+extern void fpu__drop(struct fpu *fpu);
+extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal);
+extern void fpu_flush_thread(void);
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ * - switch_fpu_prepare() saves the old state.
+ * This is done within the context of the old process.
+ *
+ * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
+ * will get loaded on return to userspace, or when the kernel needs it.
+ *
+ * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
+ * are saved in the current thread's FPU register state.
+ *
+ * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
+ * hold current()'s FPU registers. It is required to load the
+ * registers before returning to userland or using the content
+ * otherwise.
+ *
+ * The FPU context is only stored/restored for a user task and
+ * PF_KTHREAD is used to distinguish between kernel and user threads.
+ */
+static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FPU) &&
+ !(current->flags & PF_KTHREAD)) {
+ save_fpregs_to_fpstate(old_fpu);
+ /*
+ * The save operation preserved register state, so the
+ * fpu_fpregs_owner_ctx is still @old_fpu. Store the
+ * current CPU number in @old_fpu, so the next return
+ * to user space can avoid the FPU register restore
+ * when is returns on the same CPU and still owns the
+ * context.
+ */
+ old_fpu->last_cpu = cpu;
+
+ trace_x86_fpu_regs_deactivated(old_fpu);
+ }
+}
+
+/*
+ * Delay loading of the complete FPU state until the return to userland.
+ * PKRU is handled separately.
+ */
+static inline void switch_fpu_finish(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FPU))
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif /* _ASM_X86_FPU_SCHED_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..e1c9df9102a5 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -5,6 +5,11 @@
#ifndef _ASM_X86_FPU_SIGNAL_H
#define _ASM_X86_FPU_SIGNAL_H
+#include <linux/compat.h>
+#include <linux/user.h>
+
+#include <asm/fpu/types.h>
+
#ifdef CONFIG_X86_64
# include <uapi/asm/sigcontext.h>
# include <asm/user32.h>
@@ -29,6 +34,11 @@ unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size);
-extern void fpu__init_prepare_fx_sw_frame(void);
+unsigned long fpu__get_fpstate_size(void);
+
+extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+extern void fpu__clear_user_states(struct fpu *fpu);
+extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask);
#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..eb7cd1139d97 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -120,6 +120,9 @@ enum xfeature {
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
+ XFEATURE_RSRVD_COMP_16,
+ XFEATURE_XTILE_CFG,
+ XFEATURE_XTILE_DATA,
XFEATURE_MAX,
};
@@ -136,12 +139,21 @@ enum xfeature {
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
| XFEATURE_MASK_ZMM_Hi256 \
| XFEATURE_MASK_Hi16_ZMM)
+#ifdef CONFIG_X86_64
+# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \
+ | XFEATURE_MASK_XTILE_CFG)
+#else
+# define XFEATURE_MASK_XTILE (0)
+#endif
+
#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM
struct reg_128_bit {
@@ -153,6 +165,9 @@ struct reg_256_bit {
struct reg_512_bit {
u8 regbytes[512/8];
};
+struct reg_1024_byte {
+ u8 regbytes[1024];
+};
/*
* State component 2:
@@ -255,6 +270,23 @@ struct arch_lbr_state {
u64 ler_to;
u64 ler_info;
struct lbr_entry entries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+ u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+ struct reg_1024_byte tmm;
} __packed;
/*
@@ -309,6 +341,93 @@ union fpregs_state {
u8 __padding[PAGE_SIZE];
};
+struct fpstate {
+ /* @kernel_size: The size of the kernel register image */
+ unsigned int size;
+
+ /* @user_size: The size in non-compacted UABI format */
+ unsigned int user_size;
+
+ /* @xfeatures: xfeatures for which the storage is sized */
+ u64 xfeatures;
+
+ /* @user_xfeatures: xfeatures valid in UABI buffers */
+ u64 user_xfeatures;
+
+ /* @xfd: xfeatures disabled to trap userspace use. */
+ u64 xfd;
+
+ /* @is_valloc: Indicator for dynamically allocated state */
+ unsigned int is_valloc : 1;
+
+ /* @is_guest: Indicator for guest state (KVM) */
+ unsigned int is_guest : 1;
+
+ /*
+ * @is_confidential: Indicator for KVM confidential mode.
+ * The FPU registers are restored by the
+ * vmentry firmware from encrypted guest
+ * memory. On vmexit the FPU registers are
+ * saved by firmware to encrypted guest memory
+ * and the registers are scrubbed before
+ * returning to the host. So there is no
+ * content which is worth saving and restoring.
+ * The fpstate has to be there so that
+ * preemption and softirq FPU usage works
+ * without special casing.
+ */
+ unsigned int is_confidential : 1;
+
+ /* @in_use: State is in use */
+ unsigned int in_use : 1;
+
+ /* @regs: The register state union for all supported formats */
+ union fpregs_state regs;
+
+ /* @regs is dynamically sized! Don't add anything after @regs! */
+} __aligned(64);
+
+#define FPU_GUEST_PERM_LOCKED BIT_ULL(63)
+
+struct fpu_state_perm {
+ /*
+ * @__state_perm:
+ *
+ * This bitmap indicates the permission for state components, which
+ * are available to a thread group. The permission prctl() sets the
+ * enabled state bits in thread_group_leader()->thread.fpu.
+ *
+ * All run time operations use the per thread information in the
+ * currently active fpu.fpstate which contains the xfeature masks
+ * and sizes for kernel and user space.
+ *
+ * This master permission field is only to be used when
+ * task.fpu.fpstate based checks fail to validate whether the task
+ * is allowed to expand it's xfeatures set which requires to
+ * allocate a larger sized fpstate buffer.
+ *
+ * Do not access this field directly. Use the provided helper
+ * function. Unlocked access is possible for quick checks.
+ */
+ u64 __state_perm;
+
+ /*
+ * @__state_size:
+ *
+ * The size required for @__state_perm. Only valid to access
+ * with sighand locked.
+ */
+ unsigned int __state_size;
+
+ /*
+ * @__user_state_size:
+ *
+ * The size required for @__state_perm user part. Only valid to
+ * access with sighand locked.
+ */
+ unsigned int __user_state_size;
+};
+
/*
* Highest level per task FPU state data structure that
* contains the FPU register state plus various FPU
@@ -337,19 +456,130 @@ struct fpu {
unsigned long avx512_timestamp;
/*
- * @state:
+ * @fpstate:
+ *
+ * Pointer to the active struct fpstate. Initialized to
+ * point at @__fpstate below.
+ */
+ struct fpstate *fpstate;
+
+ /*
+ * @__task_fpstate:
+ *
+ * Pointer to an inactive struct fpstate. Initialized to NULL. Is
+ * used only for KVM support to swap out the regular task fpstate.
+ */
+ struct fpstate *__task_fpstate;
+
+ /*
+ * @perm:
+ *
+ * Permission related information
+ */
+ struct fpu_state_perm perm;
+
+ /*
+ * @guest_perm:
+ *
+ * Permission related information for guest pseudo FPUs
+ */
+ struct fpu_state_perm guest_perm;
+
+ /*
+ * @__fpstate:
*
- * In-memory copy of all FPU registers that we save/restore
- * over context switches. If the task is using the FPU then
- * the registers in the FPU are more recent than this state
- * copy. If the task context-switches away then they get
- * saved here and represent the FPU state.
+ * Initial in-memory storage for FPU registers which are saved in
+ * context switch and when the kernel uses the FPU. The registers
+ * are restored from this storage on return to user space if they
+ * are not longer containing the tasks FPU register state.
*/
- union fpregs_state state;
+ struct fpstate __fpstate;
/*
- * WARNING: 'state' is dynamically-sized. Do not put
+ * WARNING: '__fpstate' is dynamically-sized. Do not put
* anything after it here.
*/
};
+/*
+ * Guest pseudo FPU container
+ */
+struct fpu_guest {
+ /*
+ * @xfeatures: xfeature bitmap of features which are
+ * currently enabled for the guest vCPU.
+ */
+ u64 xfeatures;
+
+ /*
+ * @perm: xfeature bitmap of features which are
+ * permitted to be enabled for the guest
+ * vCPU.
+ */
+ u64 perm;
+
+ /*
+ * @xfd_err: Save the guest value.
+ */
+ u64 xfd_err;
+
+ /*
+ * @uabi_size: Size required for save/restore
+ */
+ unsigned int uabi_size;
+
+ /*
+ * @fpstate: Pointer to the allocated guest fpstate
+ */
+ struct fpstate *fpstate;
+};
+
+/*
+ * FPU state configuration data. Initialized at boot time. Read only after init.
+ */
+struct fpu_state_config {
+ /*
+ * @max_size:
+ *
+ * The maximum size of the register state buffer. Includes all
+ * supported features except independent managed features.
+ */
+ unsigned int max_size;
+
+ /*
+ * @default_size:
+ *
+ * The default size of the register state buffer. Includes all
+ * supported features except independent managed features and
+ * features which have to be requested by user space before usage.
+ */
+ unsigned int default_size;
+
+ /*
+ * @max_features:
+ *
+ * The maximum supported features bitmap. Does not include
+ * independent managed features.
+ */
+ u64 max_features;
+
+ /*
+ * @default_features:
+ *
+ * The default supported features bitmap. Does not include
+ * independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 default_features;
+ /*
+ * @legacy_features:
+ *
+ * Features which can be reported back to user space
+ * even without XSAVE support, i.e. legacy features FP + SSE
+ */
+ u64 legacy_features;
+};
+
+/* FPU state configuration information */
+extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+
#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
index 1c7ab8d95da5..9656a5bc6fea 100644
--- a/arch/x86/include/asm/fpu/xcr.h
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -2,18 +2,8 @@
#ifndef _ASM_X86_FPU_XCR_H
#define _ASM_X86_FPU_XCR_H
-/*
- * MXCSR and XCR definitions:
- */
-
-static inline void ldmxcsr(u32 mxcsr)
-{
- asm volatile("ldmxcsr %0" :: "m" (mxcsr));
-}
-
-extern unsigned int mxcsr_feature_mask;
-
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+#define XCR_XFEATURE_IN_USE_MASK 0x00000001
static inline u64 xgetbv(u32 index)
{
@@ -31,4 +21,15 @@ static inline void xsetbv(u32 index, u64 value)
asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
+/*
+ * Return a mask of xfeatures which are currently being tracked
+ * by the processor as being in the initial configuration.
+ *
+ * Callers should check X86_FEATURE_XGETBV1.
+ */
+static inline u64 xfeatures_in_use(void)
+{
+ return xgetbv(XCR_XFEATURE_IN_USE_MASK);
+}
+
#endif /* _ASM_X86_FPU_XCR_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..cd3dd170e23a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <asm/processor.h>
+#include <asm/fpu/api.h>
#include <asm/user.h>
/* Bit 63 of XCR0 is reserved for future expansion */
@@ -13,6 +14,8 @@
#define XSTATE_CPUID 0x0000000d
+#define TILE_CPUID 0x0000001d
+
#define FXSAVE_SIZE 512
#define XSAVE_HDR_SIZE 64
@@ -32,7 +35,19 @@
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_XTILE)
+
+/*
+ * Features which are restored when returning to user space.
+ * PKRU is not restored on return to user space because PKRU
+ * is switched eagerly in switch_to() and flush_thread()
+ */
+#define XFEATURE_MASK_USER_RESTORE \
+ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)
+
+/* Features which are dynamically enabled for a process on request */
+#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
@@ -42,21 +57,21 @@
* and its size may be huge. Saving/restoring such supervisor state components
* at each context switch can cause high CPU and space overhead, which should
* be avoided. Such supervisor state components should only be saved/restored
- * on demand. The on-demand dynamic supervisor features are set in this mask.
+ * on demand. The on-demand supervisor features are set in this mask.
*
- * Unlike the existing supported supervisor features, a dynamic supervisor
+ * Unlike the existing supported supervisor features, an independent supervisor
* feature does not allocate a buffer in task->fpu, and the corresponding
* supervisor state component cannot be saved/restored at each context switch.
*
- * To support a dynamic supervisor feature, a developer should follow the
+ * To support an independent supervisor feature, a developer should follow the
* dos and don'ts as below:
* - Do dynamically allocate a buffer for the supervisor state component.
* - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
* state component to/from the buffer.
- * - Don't set the bit corresponding to the dynamic supervisor feature in
+ * - Don't set the bit corresponding to the independent supervisor feature in
* IA32_XSS at run time, since it has been set at boot time.
*/
-#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)
+#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)
/*
* Unsupported supervisor features. When a supervisor feature in this mask is
@@ -66,54 +81,52 @@
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
- XFEATURE_MASK_DYNAMIC | \
+ XFEATURE_MASK_INDEPENDENT | \
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
-#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern u64 xfeatures_mask_all;
-
-static inline u64 xfeatures_mask_supervisor(void)
-{
- return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
-}
-
-static inline u64 xfeatures_mask_user(void)
-{
- return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
-}
-
-static inline u64 xfeatures_mask_dynamic(void)
-{
- if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;
+/*
+ * The feature mask required to restore FPU state:
+ * - All user states which are not eagerly switched in switch_to()/exec()
+ * - The suporvisor states
+ */
+#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED)
- return XFEATURE_MASK_DYNAMIC;
-}
+/*
+ * Features in this mask have space allocated in the signal frame, but may not
+ * have that space initialized when the feature is in its init state.
+ */
+#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_USER_DYNAMIC)
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
u64 xstate_mask);
-void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
-const void *get_xsave_field_ptr(int xfeature_nr);
-int using_compacted_format(void);
int xfeature_size(int xfeature_nr);
-struct membuf;
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
-void copy_supervisor_to_kernel(struct xregs_state *xsave);
-void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
-void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
+void xsaves(struct xregs_state *xsave, u64 mask);
+void xrstors(struct xregs_state *xsave, u64 mask);
+
+int xfd_enable_feature(u64 xfd_err);
+
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+#endif
-/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr);
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return static_branch_unlikely(&__fpu_state_size_dynamic);
+}
+#else
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return false;
+}
+#endif
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9f3130f40807..b5ef474be858 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -9,6 +9,13 @@
# define MCOUNT_ADDR ((unsigned long)(__fentry__))
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
+/* Ignore unused weak functions which will have non zero offsets */
+#ifdef CONFIG_HAVE_FENTRY
+# include <asm/ibt.h>
+/* Add offset for endbr64 if IBT enabled */
+# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
@@ -57,6 +64,13 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
#define ftrace_instruction_pointer_set(fregs, _ip) \
do { (fregs)->regs.ip = (_ip); } while (0)
+
+struct ftrace_ops;
+#define ftrace_graph_func ftrace_graph_func
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+#else
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -65,8 +79,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};
-#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
-
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index f9c00110a69a..99d345b686fa 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -17,13 +17,9 @@ do { \
int oldval = 0, ret; \
asm volatile("1:\t" insn "\n" \
"2:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "3:\tmov\t%3, %1\n" \
- "\tjmp\t2b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
- : "i" (-EFAULT), "0" (oparg), "1" (0)); \
+ : "0" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -39,15 +35,11 @@ do { \
"3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
"\tjnz\t2b\n" \
"4:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "5:\tmov\t%5, %1\n" \
- "\tjmp\t4b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 5b) \
- _ASM_EXTABLE_UA(3b, 5b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \
+ _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
- : "r" (oparg), "i" (-EFAULT), "1" (0)); \
+ : "r" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -95,15 +87,11 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
asm volatile("\n"
- "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
"2:\n"
- "\t.section .fixup, \"ax\"\n"
- "3:\tmov %3, %0\n"
- "\tjmp 2b\n"
- "\t.previous\n"
- _ASM_EXTABLE_UA(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \
: "+r" (ret), "=a" (oldval), "+m" (*uaddr)
- : "i" (-EFAULT), "r" (newval), "1" (oldval)
+ : "r" (newval), "1" (oldval)
: "memory"
);
user_access_end();
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 318556574345..5af8088a10df 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -38,7 +38,7 @@ extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
-extern int gart_iommu_hole_init(void);
+void gart_iommu_hole_init(void);
#else
#define gart_iommu_aperture 0
@@ -51,9 +51,8 @@ static inline void early_gart_iommu_check(void)
static inline void gart_parse_options(char *options)
{
}
-static inline int gart_iommu_hole_init(void)
+static inline void gart_iommu_hole_init(void)
{
- return -ENODEV;
}
#endif
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 032e020853aa..731ee7cc40a5 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/fixmap.h>
+#include <asm/pgtable_areas.h>
/* declarations for highmem.c */
extern unsigned long highstart_pfn, highend_pfn;
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index e6cd3fee562b..0a9407dc0859 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -52,7 +52,7 @@
* Support for passing hypercall input parameter block via XMM
* registers is available
*/
-#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4)
+#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5)
/* Frequency MSRs available */
@@ -61,6 +61,11 @@
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
/* Support for debug MSRs available */
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
+/*
+ * Support for returning hypercall output block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15)
/* stimer Direct Mode is available */
#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19)
@@ -133,6 +138,15 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+/*
+ * This is specific to AMD and specifies that enlightened TLB flush is
+ * supported. If guest opts in to this feature, ASID invalidations only
+ * flushes gva -> hpa mapping entries. To flush the TLB entries derived
+ * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace
+ * or HvFlushGuestPhysicalAddressList).
+ */
+#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22)
+
/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
#define HV_PARAVISOR_PRESENT BIT(0)
@@ -156,7 +170,7 @@ enum hv_isolation_type {
#define HV_X64_MSR_HYPERCALL 0x40000001
/* MSR used to provide vcpu index */
-#define HV_X64_MSR_VP_INDEX 0x40000002
+#define HV_REGISTER_VP_INDEX 0x40000002
/* MSR used to reset the guest OS. */
#define HV_X64_MSR_RESET 0x40000003
@@ -165,10 +179,10 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_RUNTIME 0x40000010
/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+#define HV_REGISTER_TIME_REF_COUNT 0x40000020
/* A partition's reference time stamp counter (TSC) page */
-#define HV_X64_MSR_REFERENCE_TSC 0x40000021
+#define HV_REGISTER_REFERENCE_TSC 0x40000021
/* MSR used to retrieve the TSC frequency */
#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
@@ -183,50 +197,50 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
-#define HV_X64_MSR_SCONTROL 0x40000080
-#define HV_X64_MSR_SVERSION 0x40000081
-#define HV_X64_MSR_SIEFP 0x40000082
-#define HV_X64_MSR_SIMP 0x40000083
-#define HV_X64_MSR_EOM 0x40000084
-#define HV_X64_MSR_SINT0 0x40000090
-#define HV_X64_MSR_SINT1 0x40000091
-#define HV_X64_MSR_SINT2 0x40000092
-#define HV_X64_MSR_SINT3 0x40000093
-#define HV_X64_MSR_SINT4 0x40000094
-#define HV_X64_MSR_SINT5 0x40000095
-#define HV_X64_MSR_SINT6 0x40000096
-#define HV_X64_MSR_SINT7 0x40000097
-#define HV_X64_MSR_SINT8 0x40000098
-#define HV_X64_MSR_SINT9 0x40000099
-#define HV_X64_MSR_SINT10 0x4000009A
-#define HV_X64_MSR_SINT11 0x4000009B
-#define HV_X64_MSR_SINT12 0x4000009C
-#define HV_X64_MSR_SINT13 0x4000009D
-#define HV_X64_MSR_SINT14 0x4000009E
-#define HV_X64_MSR_SINT15 0x4000009F
+#define HV_REGISTER_SCONTROL 0x40000080
+#define HV_REGISTER_SVERSION 0x40000081
+#define HV_REGISTER_SIEFP 0x40000082
+#define HV_REGISTER_SIMP 0x40000083
+#define HV_REGISTER_EOM 0x40000084
+#define HV_REGISTER_SINT0 0x40000090
+#define HV_REGISTER_SINT1 0x40000091
+#define HV_REGISTER_SINT2 0x40000092
+#define HV_REGISTER_SINT3 0x40000093
+#define HV_REGISTER_SINT4 0x40000094
+#define HV_REGISTER_SINT5 0x40000095
+#define HV_REGISTER_SINT6 0x40000096
+#define HV_REGISTER_SINT7 0x40000097
+#define HV_REGISTER_SINT8 0x40000098
+#define HV_REGISTER_SINT9 0x40000099
+#define HV_REGISTER_SINT10 0x4000009A
+#define HV_REGISTER_SINT11 0x4000009B
+#define HV_REGISTER_SINT12 0x4000009C
+#define HV_REGISTER_SINT13 0x4000009D
+#define HV_REGISTER_SINT14 0x4000009E
+#define HV_REGISTER_SINT15 0x4000009F
/*
* Synthetic Timer MSRs. Four timers per vcpu.
*/
-#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
-#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
-#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
-#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
-#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
-#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
-#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
-#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+#define HV_REGISTER_STIMER0_CONFIG 0x400000B0
+#define HV_REGISTER_STIMER0_COUNT 0x400000B1
+#define HV_REGISTER_STIMER1_CONFIG 0x400000B2
+#define HV_REGISTER_STIMER1_COUNT 0x400000B3
+#define HV_REGISTER_STIMER2_CONFIG 0x400000B4
+#define HV_REGISTER_STIMER2_COUNT 0x400000B5
+#define HV_REGISTER_STIMER3_CONFIG 0x400000B6
+#define HV_REGISTER_STIMER3_COUNT 0x400000B7
/* Hyper-V guest idle MSR */
#define HV_X64_MSR_GUEST_IDLE 0x400000F0
/* Hyper-V guest crash notification MSR's */
-#define HV_X64_MSR_CRASH_P0 0x40000100
-#define HV_X64_MSR_CRASH_P1 0x40000101
-#define HV_X64_MSR_CRASH_P2 0x40000102
-#define HV_X64_MSR_CRASH_P3 0x40000103
-#define HV_X64_MSR_CRASH_P4 0x40000104
-#define HV_X64_MSR_CRASH_CTL 0x40000105
+#define HV_REGISTER_CRASH_P0 0x40000100
+#define HV_REGISTER_CRASH_P1 0x40000101
+#define HV_REGISTER_CRASH_P2 0x40000102
+#define HV_REGISTER_CRASH_P3 0x40000103
+#define HV_REGISTER_CRASH_P4 0x40000104
+#define HV_REGISTER_CRASH_CTL 0x40000105
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -236,6 +250,49 @@ enum hv_isolation_type {
/* TSC invariant control */
#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118
+/* Register name aliases for temporary compatibility */
+#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
+#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
+#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT
+#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG
+#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT
+#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG
+#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT
+#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG
+#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL
+#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION
+#define HV_X64_MSR_SIMP HV_REGISTER_SIMP
+#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP
+#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX
+#define HV_X64_MSR_EOM HV_REGISTER_EOM
+#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0
+#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15
+#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0
+#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1
+#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2
+#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3
+#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4
+#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL
+#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
+#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+
+/* Hyper-V memory host visibility */
+enum hv_mem_host_visibility {
+ VMBUS_PAGE_NOT_VISIBLE = 0,
+ VMBUS_PAGE_VISIBLE_READ_ONLY = 1,
+ VMBUS_PAGE_VISIBLE_READ_WRITE = 3
+};
+
+/* HvCallModifySparseGpaPageHostVisibility hypercall */
+#define HV_MAX_MODIFY_GPA_REP_COUNT ((PAGE_SIZE / sizeof(u64)) - 2)
+struct hv_gpa_range_for_visibility {
+ u64 partition_id;
+ u32 host_visibility:2;
+ u32 reserved0:30;
+ u32 reserved1;
+ u64 gpa_page_list[HV_MAX_MODIFY_GPA_REP_COUNT];
+} __packed;
+
/*
* Declare the MSR used to setup pages used to communicate with the hypervisor.
*/
@@ -248,6 +305,15 @@ union hv_x64_msr_hypercall_contents {
} __packed;
};
+union hv_vp_assist_msr_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 pfn:52;
+ } __packed;
+};
+
struct hv_reenlightenment_control {
__u64 vector:8;
__u64 reserved1:8;
@@ -288,34 +354,8 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
-
-/* Define hypervisor message types. */
-enum hv_message_type {
- HVMSG_NONE = 0x00000000,
-
- /* Memory access messages. */
- HVMSG_UNMAPPED_GPA = 0x80000000,
- HVMSG_GPA_INTERCEPT = 0x80000001,
-
- /* Timer notification messages. */
- HVMSG_TIMER_EXPIRED = 0x80000010,
-
- /* Error messages. */
- HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
- HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
- HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
-
- /* Trace buffer complete messages. */
- HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
-
- /* Platform-specific processor intercept messages. */
- HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
- HVMSG_X64_MSR_INTERCEPT = 0x80010001,
- HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
- HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
- HVMSG_X64_APIC_EOI = 0x80010004,
- HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
-};
+/* Number of XMM registers used in hypercall input/output */
+#define HV_HYPERCALL_MAX_XMM_REGISTERS 6
struct hv_nested_enlightenments_control {
struct {
@@ -562,6 +602,39 @@ enum hv_interrupt_type {
HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
};
+union hv_msi_address_register {
+ u32 as_uint32;
+ struct {
+ u32 reserved1:2;
+ u32 destination_mode:1;
+ u32 redirection_hint:1;
+ u32 reserved2:8;
+ u32 destination_id:8;
+ u32 msi_base:12;
+ };
+} __packed;
+
+union hv_msi_data_register {
+ u32 as_uint32;
+ struct {
+ u32 vector:8;
+ u32 delivery_mode:3;
+ u32 reserved1:3;
+ u32 level_assert:1;
+ u32 trigger_mode:1;
+ u32 reserved2:16;
+ };
+} __packed;
+
+/* HvRetargetDeviceInterrupt hypercall */
+union hv_msi_entry {
+ u64 as_uint64;
+ struct {
+ union hv_msi_address_register address;
+ union hv_msi_data_register data;
+ } __packed;
+};
+
#include <asm-generic/hyperv-tlfs.h>
#endif
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 89789e8c80f6..637fa1df3512 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -19,6 +19,8 @@ extern unsigned int cached_irq_mask;
#define PIC_MASTER_OCW3 PIC_MASTER_ISR
#define PIC_SLAVE_CMD 0xa0
#define PIC_SLAVE_IMR 0xa1
+#define PIC_ELCR1 0x4d0
+#define PIC_ELCR2 0x4d1
/* i8259A PIC related value */
#define PIC_CASCADE_IR 2
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 2c5f7861d373..fada857f0a1e 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -68,6 +68,6 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
#endif
-#endif /* !CONFIG_IA32_SUPPORT */
+#endif /* CONFIG_IA32_EMULATION */
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
new file mode 100644
index 000000000000..689880eca9ba
--- /dev/null
+++ b/arch/x86/include/asm/ibt.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IBT_H
+#define _ASM_X86_IBT_H
+
+#include <linux/types.h>
+
+/*
+ * The rules for enabling IBT are:
+ *
+ * - CC_HAS_IBT: the toolchain supports it
+ * - X86_KERNEL_IBT: it is selected in Kconfig
+ * - !__DISABLE_EXPORTS: this is regular kernel code
+ *
+ * Esp. that latter one is a bit non-obvious, but some code like compressed,
+ * purgatory, realmode etc.. is built with custom CFLAGS that do not include
+ * -fcf-protection=branch and things will go *bang*.
+ *
+ * When all the above are satisfied, HAS_KERNEL_IBT will be 1, otherwise 0.
+ */
+#if defined(CONFIG_X86_KERNEL_IBT) && !defined(__DISABLE_EXPORTS)
+
+#define HAS_KERNEL_IBT 1
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_64
+#define ASM_ENDBR "endbr64\n\t"
+#else
+#define ASM_ENDBR "endbr32\n\t"
+#endif
+
+#define __noendbr __attribute__((nocf_check))
+
+static inline __attribute_const__ u32 gen_endbr(void)
+{
+ u32 endbr;
+
+ /*
+ * Generate ENDBR64 in a way that is sure to not result in
+ * an ENDBR64 instruction as immediate.
+ */
+ asm ( "mov $~0xfa1e0ff3, %[endbr]\n\t"
+ "not %[endbr]\n\t"
+ : [endbr] "=&r" (endbr) );
+
+ return endbr;
+}
+
+static inline __attribute_const__ u32 gen_endbr_poison(void)
+{
+ /*
+ * 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it
+ * will be unique to (former) ENDBR sites.
+ */
+ return 0x001f0f66; /* osp nopl (%rax) */
+}
+
+static inline bool is_endbr(u32 val)
+{
+ if (val == gen_endbr_poison())
+ return true;
+
+ val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */
+ return val == gen_endbr();
+}
+
+extern __noendbr u64 ibt_save(void);
+extern __noendbr void ibt_restore(u64 save);
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_64
+#define ENDBR endbr64
+#else
+#define ENDBR endbr32
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#else /* !IBT */
+
+#define HAS_KERNEL_IBT 0
+
+#ifndef __ASSEMBLY__
+
+#define ASM_ENDBR
+
+#define __noendbr
+
+static inline bool is_endbr(u32 val) { return false; }
+
+static inline u64 ibt_save(void) { return 0; }
+static inline void ibt_restore(u64 save) { }
+
+#else /* __ASSEMBLY__ */
+
+#define ENDBR
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
+#define ENDBR_INSN_SIZE (4*HAS_KERNEL_IBT)
+
+#endif /* _ASM_X86_IBT_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 5eb3bdf36a41..72184b0b2219 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -5,6 +5,8 @@
/* Interrupts/Exceptions */
#include <asm/trapnr.h>
+#define IDT_ALIGN (8 * (1 + HAS_KERNEL_IBT))
+
#ifndef __ASSEMBLY__
#include <linux/entry-common.h>
#include <linux/hardirq.h>
@@ -312,8 +314,8 @@ static __always_inline void __##func(struct pt_regs *regs)
*/
#define DECLARE_IDTENTRY_VC(vector, func) \
DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
- __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \
- __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code)
+ __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
+ __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
/**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points
@@ -355,33 +357,24 @@ static __always_inline void __##func(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
/**
- * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
- which runs on a safe stack.
- * @func: Function name of the entry point
- *
- * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
- */
-#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
-
-/**
- * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
- which runs on the VC fall-back stack
+ * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
+ when raised from kernel mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
-#define DEFINE_IDTENTRY_VC_IST(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
+#define DEFINE_IDTENTRY_VC_KERNEL(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
/**
- * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
+ * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
+ when raised from user mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
-#define DEFINE_IDTENTRY_VC(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+#define DEFINE_IDTENTRY_VC_USER(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
#else /* CONFIG_X86_64 */
@@ -489,7 +482,7 @@ __visible noinstr void func(struct pt_regs *regs, \
/*
* ASM code to emit the common vector entry stubs where each stub is
- * packed into 8 bytes.
+ * packed into IDT_ALIGN bytes.
*
* Note, that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because
* GCC treats the local vector variable as unsigned int and would expand
@@ -501,33 +494,33 @@ __visible noinstr void func(struct pt_regs *regs, \
* point is to mask off the bits above bit 7 because the push is sign
* extending.
*/
- .align 8
+ .align IDT_ALIGN
SYM_CODE_START(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ .rept NR_EXTERNAL_VECTORS
UNWIND_HINT_IRET_REGS
0 :
+ ENDBR
.byte 0x6a, vector
jmp asm_common_interrupt
- nop
- /* Ensure that the above is 8 bytes max */
- . = 0b + 8
+ /* Ensure that the above is IDT_ALIGN bytes max */
+ .fill 0b + IDT_ALIGN - ., 1, 0xcc
vector = vector+1
.endr
SYM_CODE_END(irq_entries_start)
#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
+ .align IDT_ALIGN
SYM_CODE_START(spurious_entries_start)
vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ .rept NR_SYSTEM_VECTORS
UNWIND_HINT_IRET_REGS
0 :
+ ENDBR
.byte 0x6a, vector
jmp asm_spurious_interrupt
- nop
- /* Ensure that the above is 8 bytes max */
- . = 0b + 8
+ /* Ensure that the above is IDT_ALIGN bytes max */
+ .fill 0b + IDT_ALIGN - ., 1, 0xcc
vector = vector+1
.endr
SYM_CODE_END(spurious_entries_start)
@@ -547,7 +540,7 @@ SYM_CODE_END(spurious_entries_start)
/*
* Dummy trap number so the low level ASM macro vector number checks do not
* match which results in emitting plain IDTENTRY stubs without bells and
- * whistels.
+ * whistles.
*/
#define X86_TRAP_OTHER 0xFFFF
@@ -588,6 +581,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
/* NMI */
+
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * Special NOIST entry point for VMX which invokes this on the kernel
+ * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
+ * 'executing' marker.
+ *
+ * On 32bit this just uses the regular NMI entry point because 32-bit does
+ * not have ISTs.
+ */
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
+#else
+#define asm_exc_nmi_noist asm_exc_nmi
+#endif
+
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
@@ -609,6 +617,11 @@ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
#endif
+/* #CP */
+#ifdef CONFIG_X86_KERNEL_IBT
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
+#endif
+
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
@@ -619,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap);
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception);
+#endif
+
/* Device interrupts common/spurious */
DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 4cf2ad521f65..b56c5741581a 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -6,7 +6,7 @@
*
* Written by Masami Hiramatsu <mhiramat@redhat.com>
*/
-#include <asm/inat_types.h>
+#include <asm/inat_types.h> /* __ignore_sync_check__ */
/*
* Internal bits. Don't use bitmasks directly, because these bits are
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 98b4dae5e8bc..f07faa61c7f3 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -15,17 +15,33 @@
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+int pt_regs_offset(struct pt_regs *regs, int regno);
+
bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
-bool insn_decode(struct insn *insn, struct pt_regs *regs,
- unsigned char buf[MAX_INSN_SIZE], int buf_size);
+bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size);
+
+enum mmio_type {
+ MMIO_DECODE_FAILED,
+ MMIO_WRITE,
+ MMIO_WRITE_IMM,
+ MMIO_READ,
+ MMIO_READ_ZERO_EXTEND,
+ MMIO_READ_SIGN_EXTEND,
+ MMIO_MOVS,
+};
+
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 95a448fbb44c..1b29f58f730f 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -9,7 +9,7 @@
#include <asm/byteorder.h>
/* insn_attr_t is defined in inat.h */
-#include <asm/inat.h>
+#include <asm/inat.h> /* __ignore_sync_check__ */
#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
@@ -124,7 +124,7 @@ struct insn {
#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
/* VEX bit fields */
-#define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */
+#define X86_EVEX_M(vex) ((vex) & 0x07) /* EVEX Byte1 */
#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
#define X86_VEX2_M 1 /* VEX2.M always 1 */
#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
@@ -132,13 +132,25 @@ struct insn {
#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
-extern void insn_get_prefixes(struct insn *insn);
-extern void insn_get_opcode(struct insn *insn);
-extern void insn_get_modrm(struct insn *insn);
-extern void insn_get_sib(struct insn *insn);
-extern void insn_get_displacement(struct insn *insn);
-extern void insn_get_immediate(struct insn *insn);
-extern void insn_get_length(struct insn *insn);
+extern int insn_get_prefixes(struct insn *insn);
+extern int insn_get_opcode(struct insn *insn);
+extern int insn_get_modrm(struct insn *insn);
+extern int insn_get_sib(struct insn *insn);
+extern int insn_get_displacement(struct insn *insn);
+extern int insn_get_immediate(struct insn *insn);
+extern int insn_get_length(struct insn *insn);
+
+enum insn_mode {
+ INSN_MODE_32,
+ INSN_MODE_64,
+ /* Mode is determined by the current kernel build. */
+ INSN_MODE_KERN,
+ INSN_NUM_MODES,
+};
+
+extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m);
+
+#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN)
/* Attribute will be determined after getting ModRM (for opcode groups) */
static inline void insn_get_attribute(struct insn *insn)
@@ -149,17 +161,6 @@ static inline void insn_get_attribute(struct insn *insn)
/* Instruction uses RIP-relative addressing */
extern int insn_rip_relative(struct insn *insn);
-/* Init insn for kernel text */
-static inline void kernel_insn_init(struct insn *insn,
- const void *kaddr, int buf_len)
-{
-#ifdef CONFIG_X86_64
- insn_init(insn, kaddr, buf_len, 1);
-#else /* CONFIG_X86_32 */
- insn_init(insn, kaddr, buf_len, 0);
-#endif
-}
-
static inline int insn_is_avx(struct insn *insn)
{
if (!insn->prefixes.got)
@@ -179,13 +180,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn)
return !!insn->emulate_prefix_size;
}
-/* Ensure this instruction is decoded completely */
-static inline int insn_complete(struct insn *insn)
-{
- return insn->opcode.got && insn->modrm.got && insn->sib.got &&
- insn->displacement.got && insn->immediate.got;
-}
-
static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
{
if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 9abe842dbd84..def6ca121111 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -26,13 +26,16 @@
* _G - parts with extra graphics on
* _X - regular server parts
* _D - micro server parts
+ * _N,_P - other mobile parts
*
* Historical OPTDIFFs:
*
* _EP - 2 socket server parts
* _EX - 4+ socket server parts
*
- * The #define line may optionally include a comment including platform names.
+ * The #define line may optionally include a comment including platform or core
+ * names. An exception is made for skylake/kabylake where steppings seem to have gotten
+ * their own names :-(
*/
/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
@@ -69,35 +72,46 @@
#define INTEL_FAM6_BROADWELL_X 0x4F
#define INTEL_FAM6_BROADWELL_D 0x56
-#define INTEL_FAM6_SKYLAKE_L 0x4E
-#define INTEL_FAM6_SKYLAKE 0x5E
-#define INTEL_FAM6_SKYLAKE_X 0x55
-#define INTEL_FAM6_KABYLAKE_L 0x8E
-#define INTEL_FAM6_KABYLAKE 0x9E
+#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
+#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
+#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
+/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
+/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
-#define INTEL_FAM6_CANNONLAKE_L 0x66
+#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
+/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
+/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
+/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
-#define INTEL_FAM6_ICELAKE_X 0x6A
-#define INTEL_FAM6_ICELAKE_D 0x6C
-#define INTEL_FAM6_ICELAKE 0x7D
-#define INTEL_FAM6_ICELAKE_L 0x7E
-#define INTEL_FAM6_ICELAKE_NNPI 0x9D
+#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
+/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
-#define INTEL_FAM6_TIGERLAKE_L 0x8C
-#define INTEL_FAM6_TIGERLAKE 0x8D
+#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
+#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
-#define INTEL_FAM6_COMETLAKE 0xA5
-#define INTEL_FAM6_COMETLAKE_L 0xA6
+#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
-#define INTEL_FAM6_ROCKETLAKE 0xA7
+#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
-#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F
+#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
-/* Hybrid Core/Atom Processors */
+#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
-#define INTEL_FAM6_LAKEFIELD 0x8A
-#define INTEL_FAM6_ALDERLAKE 0x97
-#define INTEL_FAM6_ALDERLAKE_L 0x9A
+#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
+#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
+
+#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
+
+#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
+#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_FAM6_ALDERLAKE_N 0xBE
+
+#define INTEL_FAM6_RAPTORLAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE_P 0xBA
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
index 8380c3ddd4b2..2f9eeb5c3069 100644
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@@ -7,8 +7,9 @@
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 8
-#define MAX_FIXED_PEBS_EVENTS 4
+#define MAX_PEBS_EVENTS_FMT4 8
+#define MAX_PEBS_EVENTS 32
+#define MAX_FIXED_PEBS_EVENTS 16
/*
* A debug store configuration.
diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h
index 3cb002b1d0f9..994638ef171b 100644
--- a/arch/x86/include/asm/intel_pconfig.h
+++ b/arch/x86/include/asm/intel_pconfig.h
@@ -38,7 +38,7 @@ enum pconfig_leaf {
#define MKTME_INVALID_ENC_ALG 4
#define MKTME_DEVICE_BUSY 5
-/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */
+/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */
struct mktme_key_program {
u16 keyid;
u32 keyid_ctrl;
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 423b788f495e..c796e9bc98b6 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -3,7 +3,7 @@
#define _ASM_X86_INTEL_PT_H
#define PT_CPUID_LEAVES 2
-#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
+#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
enum pt_capabilities {
PT_CAP_max_subleaf = 0,
@@ -13,6 +13,8 @@ enum pt_capabilities {
PT_CAP_mtc,
PT_CAP_ptwrite,
PT_CAP_power_event_trace,
+ PT_CAP_event_trace,
+ PT_CAP_tnt_disable,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d726459d08e5..1870b99c3356 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,9 +40,11 @@
#include <linux/string.h>
#include <linux/compiler.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
+#include <asm/shared/io.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -159,7 +161,7 @@ static inline void *phys_to_virt(phys_addr_t address)
/*
* ISA I/O bus memory addresses are 1:1 with the physical address.
* However, we truncate the address to unsigned int to avoid undesirable
- * promitions in legacy drivers.
+ * promotions in legacy drivers.
*/
static inline unsigned int isa_virt_to_bus(volatile void *address)
{
@@ -209,8 +211,6 @@ void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void iounmap(volatile void __iomem *addr);
#define iounmap iounmap
-extern void set_iounmap_nonlazy(void);
-
#ifdef __KERNEL__
void memcpy_fromio(void *, const volatile void __iomem *, size_t);
@@ -256,53 +256,24 @@ static inline void slow_down_io(void)
#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include <linux/jump_label.h>
-
-extern struct static_key_false sev_enable_key;
-static inline bool sev_key_active(void)
-{
- return static_branch_unlikely(&sev_enable_key);
-}
-
-#else /* !CONFIG_AMD_MEM_ENCRYPT */
-
-static inline bool sev_key_active(void) { return false; }
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
#define BUILDIO(bwl, bw, type) \
-static inline void out##bwl(unsigned type value, int port) \
-{ \
- asm volatile("out" #bwl " %" #bw "0, %w1" \
- : : "a"(value), "Nd"(port)); \
-} \
- \
-static inline unsigned type in##bwl(int port) \
-{ \
- unsigned type value; \
- asm volatile("in" #bwl " %w1, %" #bw "0" \
- : "=a"(value) : "Nd"(port)); \
- return value; \
-} \
- \
-static inline void out##bwl##_p(unsigned type value, int port) \
+static inline void out##bwl##_p(type value, u16 port) \
{ \
out##bwl(value, port); \
slow_down_io(); \
} \
\
-static inline unsigned type in##bwl##_p(int port) \
+static inline type in##bwl##_p(u16 port) \
{ \
- unsigned type value = in##bwl(port); \
+ type value = in##bwl(port); \
slow_down_io(); \
return value; \
} \
\
-static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
- unsigned type *value = (unsigned type *)addr; \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
+ type *value = (type *)addr; \
while (count) { \
out##bwl(*value, port); \
value++; \
@@ -315,10 +286,10 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
} \
} \
\
-static inline void ins##bwl(int port, void *addr, unsigned long count) \
+static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
- unsigned type *value = (unsigned type *)addr; \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
+ type *value = (type *)addr; \
while (count) { \
*value = in##bwl(port); \
value++; \
@@ -331,13 +302,11 @@ static inline void ins##bwl(int port, void *addr, unsigned long count) \
} \
}
-BUILDIO(b, b, char)
-BUILDIO(w, w, short)
-BUILDIO(l, , int)
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
+#undef BUILDIO
-#define inb inb
-#define inw inw
-#define inl inl
#define inb_p inb_p
#define inw_p inw_p
#define inl_p inl_p
@@ -345,9 +314,6 @@ BUILDIO(l, , int)
#define insw insw
#define insl insl
-#define outb outb
-#define outw outw
-#define outl outl
#define outb_p outb_p
#define outw_p outw_p
#define outl_p outl_p
@@ -391,6 +357,7 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
extern bool arch_memremap_can_ram_remap(resource_size_t offset,
unsigned long size,
unsigned long flags);
@@ -398,6 +365,13 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset,
extern bool phys_mem_access_encrypted(unsigned long phys_addr,
unsigned long size);
+#else
+static inline bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size)
+{
+ return true;
+}
+#endif
/**
* iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index bf1ed2ddc74b..0bef44d30a27 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -8,6 +8,14 @@
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_merge;
+extern int panic_on_overflow;
+
+#ifdef CONFIG_SWIOTLB
+extern bool x86_swiotlb_enable;
+#else
+#define x86_swiotlb_enable false
+#endif
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
deleted file mode 100644
index 1fb3fd1a83c2..000000000000
--- a/arch/x86/include/asm/iommu_table.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IOMMU_TABLE_H
-#define _ASM_X86_IOMMU_TABLE_H
-
-#include <asm/swiotlb.h>
-
-/*
- * History lesson:
- * The execution chain of IOMMUs in 2.6.36 looks as so:
- *
- * [xen-swiotlb]
- * |
- * +----[swiotlb *]--+
- * / | \
- * / | \
- * [GART] [Calgary] [Intel VT-d]
- * /
- * /
- * [AMD-Vi]
- *
- * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
- * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
- * Also it would surreptitiously initialize set the swiotlb=1 if there were
- * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
- * flag would be turned off by all IOMMUs except the Calgary one.
- *
- * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
- * to be built by defining who we depend on.
- *
- * And all that needs to be done is to use one of the macros in the IOMMU
- * and the pci-dma.c will take care of the rest.
- */
-
-struct iommu_table_entry {
- initcall_t detect;
- initcall_t depend;
- void (*early_init)(void); /* No memory allocate available. */
- void (*late_init)(void); /* Yes, can allocate memory. */
-#define IOMMU_FINISH_IF_DETECTED (1<<0)
-#define IOMMU_DETECTED (1<<1)
- int flags;
-};
-/*
- * Macro fills out an entry in the .iommu_table that is equivalent
- * to the fields that 'struct iommu_table_entry' has. The entries
- * that are put in the .iommu_table section are not put in any order
- * hence during boot-time we will have to resort them based on
- * dependency. */
-
-
-#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
- static const struct iommu_table_entry \
- __iommu_entry_##_detect __used \
- __attribute__ ((unused, __section__(".iommu_table"), \
- aligned((sizeof(void *))))) \
- = {_detect, _depend, _early_init, _late_init, \
- _finish ? IOMMU_FINISH_IF_DETECTED : 0}
-/*
- * The simplest IOMMU definition. Provide the detection routine
- * and it will be run after the SWIOTLB and the other IOMMUs
- * that utilize this macro. If the IOMMU is detected (ie, the
- * detect routine returns a positive value), the other IOMMUs
- * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
- * to stop detecting the other IOMMUs after yours has been detected.
- */
-#define IOMMU_INIT_POST(_detect) \
- __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0)
-
-#define IOMMU_INIT_POST_FINISH(detect) \
- __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1)
-
-/*
- * A more sophisticated version of IOMMU_INIT. This variant requires:
- * a). A detection routine function.
- * b). The name of the detection routine we depend on to get called
- * before us.
- * c). The init routine which gets called if the detection routine
- * returns a positive value from the pci_iommu_alloc. This means
- * no presence of a memory allocator.
- * d). Similar to the 'init', except that this gets called from pci_iommu_init
- * where we do have a memory allocator.
- *
- * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
- * in that the former will continue detecting other IOMMUs in the call
- * list after the detection routine returns a positive number, while the
- * latter will stop the execution chain upon first successful detection.
- * Both variants will still call the 'init' and 'late_init' functions if
- * they are set.
- */
-#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
- __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
-
-#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
- __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
-
-void sort_iommu_table(struct iommu_table_entry *start,
- struct iommu_table_entry *finish);
-
-void check_iommu_entries(struct iommu_table_entry *start,
- struct iommu_table_entry *finish);
-
-#endif /* _ASM_X86_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 9b2a0ff76c73..63f818aedf77 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -3,6 +3,7 @@
#define _ASM_X86_IRQ_STACK_H
#include <linux/ptrace.h>
+#include <linux/objtool.h>
#include <asm/processor.h>
@@ -58,7 +59,7 @@
* the output constraints to make the compiler aware that R11 cannot be
* reused after the asm() statement.
*
- * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is
+ * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is
* required as well as this prevents certain creative GCC variants from
* misplacing the ASM code.
*
@@ -77,11 +78,11 @@
* Function calls can clobber anything except the callee-saved
* registers. Tell the compiler.
*/
-#define call_on_irqstack(func, asm_call, argconstr...) \
+#define call_on_stack(stack, func, asm_call, argconstr...) \
{ \
register void *tos asm("r11"); \
\
- tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \
+ tos = ((void *)(stack)); \
\
asm_inline volatile( \
"movq %%rsp, (%[tos]) \n" \
@@ -98,6 +99,26 @@
); \
}
+#define ASM_CALL_ARG0 \
+ "call %P[__func] \n" \
+ ASM_REACHABLE
+
+#define ASM_CALL_ARG1 \
+ "movq %[arg1], %%rdi \n" \
+ ASM_CALL_ARG0
+
+#define ASM_CALL_ARG2 \
+ "movq %[arg2], %%rsi \n" \
+ ASM_CALL_ARG1
+
+#define ASM_CALL_ARG3 \
+ "movq %[arg3], %%rdx \n" \
+ ASM_CALL_ARG2
+
+#define call_on_irqstack(func, asm_call, argconstr...) \
+ call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ func, asm_call, argconstr)
+
/* Macros to assert type correctness for run_*_on_irqstack macros */
#define assert_function_type(func, proto) \
static_assert(__builtin_types_compatible_p(typeof(&func), proto))
@@ -147,8 +168,7 @@
*/
#define ASM_CALL_SYSVEC \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG1 \
"call irq_exit_rcu \n"
#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
@@ -168,12 +188,10 @@
*/
#define ASM_CALL_IRQ \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "movl %[arg2], %%esi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG2 \
"call irq_exit_rcu \n"
-#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector)
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" ((unsigned long)vector)
#define run_irq_on_irqstack_cond(func, regs, vector) \
{ \
@@ -185,22 +203,22 @@
IRQ_CONSTRAINTS, regs, vector); \
}
-#define ASM_CALL_SOFTIRQ \
- "call %P[__func] \n"
-
+#ifndef CONFIG_PREEMPT_RT
/*
* Macro to invoke __do_softirq on the irq stack. This is only called from
- * task context when bottom halfs are about to be reenabled and soft
+ * task context when bottom halves are about to be reenabled and soft
* interrupts are pending to be processed. The interrupt stack cannot be in
* use here.
*/
#define do_softirq_own_stack() \
{ \
__this_cpu_write(hardirq_stack_inuse, true); \
- call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \
+ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
__this_cpu_write(hardirq_stack_inuse, false); \
}
+#endif
+
#else /* CONFIG_X86_64 */
/* System vector handlers always run on the stack they interrupted. */
#define run_sysvec_on_irqstack_cond(func, regs) \
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 889f8b1b5b7f..43dcb9284208 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -26,8 +26,8 @@
* This file enumerates the exact layout of them:
*/
+/* This is used as an interrupt vector when programming the APIC. */
#define NMI_VECTOR 0x02
-#define MCE_VECTOR 0x12
/*
* IDT vectors usable for external interrupt sources start at 0x20.
@@ -84,7 +84,7 @@
*/
#define IRQ_WORK_VECTOR 0xf6
-#define UV_BAU_MESSAGE 0xf5
+/* 0xf5 - unused, was UV_BAU_MESSAGE */
#define DEFERRED_ERROR_VECTOR 0xf4
/* Vector on which hypervisor callbacks will be delivered */
@@ -114,6 +114,9 @@
#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
+#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+
/*
* Size the maximum number of interrupts.
*
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 144d70ea4393..7793e52d6237 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -109,18 +109,11 @@ static __always_inline unsigned long arch_local_irq_save(void)
}
#else
-#define ENABLE_INTERRUPTS(x) sti
-#define DISABLE_INTERRUPTS(x) cli
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(x) pushfq; popq %rax
+#define SAVE_FLAGS pushfq; popq %rax
#endif
-#define INTERRUPT_RETURN jmp native_iret
-
-#else
-#define INTERRUPT_RETURN iret
#endif
#endif /* __ASSEMBLY__ */
@@ -144,14 +137,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
if (!arch_irqs_disabled_flags(flags))
arch_local_irq_enable();
}
-#else
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_XEN_PV
-#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
-#else
-#define SWAPGS swapgs
-#endif
-#endif
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 06c3cc22a058..071572e23d3a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -4,14 +4,6 @@
#define HAVE_JUMP_LABEL_BATCH
-#define JUMP_LABEL_NOP_SIZE 5
-
-#ifdef CONFIG_X86_64
-# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
-#else
-# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
-#endif
-
#include <asm/asm.h>
#include <asm/nops.h>
@@ -20,15 +12,35 @@
#include <linux/stringify.h>
#include <linux/types.h>
+#define JUMP_TABLE_ENTRY \
+ ".pushsection __jump_table, \"aw\" \n\t" \
+ _ASM_ALIGN "\n\t" \
+ ".long 1b - . \n\t" \
+ ".long %l[l_yes] - . \n\t" \
+ _ASM_PTR "%c0 + %c1 - .\n\t" \
+ ".popsection \n\t"
+
+#ifdef CONFIG_HAVE_JUMP_LABEL_HACK
+
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
- ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- ".long 1b - ., %l[l_yes] - . \n\t"
- _ASM_PTR "%c0 + %c1 - .\n\t"
- ".popsection \n\t"
+ "jmp %l[l_yes] # objtool NOPs this \n\t"
+ JUMP_TABLE_ENTRY
+ : : "i" (key), "i" (2 | branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */
+
+static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
+{
+ asm_volatile_goto("1:"
+ ".byte " __stringify(BYTES_NOP5) "\n\t"
+ JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
return false;
@@ -36,16 +48,13 @@ l_yes:
return true;
}
-static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */
+
+static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
asm_volatile_goto("1:"
- ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
- "2:\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- ".long 1b - ., %l[l_yes] - . \n\t"
- _ASM_PTR "%c0 + %c1 - .\n\t"
- ".popsection \n\t"
+ "jmp %l[l_yes]\n\t"
+ JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
return false;
@@ -53,41 +62,7 @@ l_yes:
return true;
}
-#else /* __ASSEMBLY__ */
-
-.macro STATIC_JUMP_IF_TRUE target, key, def
-.Lstatic_jump_\@:
- .if \def
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .else
- .byte STATIC_KEY_INIT_NOP
- .endif
- .pushsection __jump_table, "aw"
- _ASM_ALIGN
- .long .Lstatic_jump_\@ - ., \target - .
- _ASM_PTR \key - .
- .popsection
-.endm
-
-.macro STATIC_JUMP_IF_FALSE target, key, def
-.Lstatic_jump_\@:
- .if \def
- .byte STATIC_KEY_INIT_NOP
- .else
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .endif
- .pushsection __jump_table, "aw"
- _ASM_ALIGN
- .long .Lstatic_jump_\@ - ., \target - .
- _ASM_PTR \key + 1 - .
- .popsection
-.endm
+extern int arch_jump_entry_size(struct jump_entry *entry);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6802c59e8252..6ad8d946cd3e 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -129,7 +129,7 @@ relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long start_address,
unsigned int preserve_context,
- unsigned int sme_active);
+ unsigned int host_mem_enc_active);
#endif
#define ARCH_HAS_KIMAGE_ARCH
@@ -150,11 +150,6 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
-
- /* Core ELF header buffer */
- void *elf_headers;
- unsigned long elf_headers_sz;
- unsigned long elf_load_addr;
};
#endif /* CONFIG_X86_32 */
@@ -191,6 +186,14 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+#ifdef CONFIG_KEXEC_FILE
+struct purgatory_info;
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+#endif
#endif
typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
index 97bbb4a9083a..ff5c7134a37a 100644
--- a/arch/x86/include/asm/kfence.h
+++ b/arch/x86/include/asm/kfence.h
@@ -8,6 +8,8 @@
#ifndef _ASM_X86_KFENCE_H
#define _ASM_X86_KFENCE_H
+#ifndef MODULE
+
#include <linux/bug.h>
#include <linux/kfence.h>
@@ -56,9 +58,16 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect)
else
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- /* Flush this CPU's TLB. */
+ /*
+ * Flush this CPU's TLB, assuming whoever did the allocation/free is
+ * likely to continue running on this CPU.
+ */
+ preempt_disable();
flush_tlb_one_kernel(addr);
+ preempt_enable();
return true;
}
+#endif /* !MODULE */
+
#endif /* _ASM_X86_KFENCE_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index d20a3d6be36e..71ea2eab43d5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -49,7 +49,6 @@ extern __visible kprobe_opcode_t optprobe_template_end[];
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
-asmlinkage void kretprobe_trampoline(void);
extern void arch_kprobe_override_function(struct pt_regs *regs);
@@ -65,10 +64,22 @@ struct arch_specific_insn {
* a post_handler).
*/
unsigned boostable:1;
- unsigned if_modifier:1;
- unsigned is_call:1;
- unsigned is_pushf:1;
- unsigned is_abs_ip:1;
+ unsigned char size; /* The size of insn */
+ union {
+ unsigned char opcode;
+ struct {
+ unsigned char type;
+ } jcc;
+ struct {
+ unsigned char type;
+ unsigned char asize;
+ } loop;
+ struct {
+ unsigned char reg;
+ } indirect;
+ };
+ s32 rel32; /* relative offset must be s32, s16, or s8 */
+ void (*emulate_op)(struct kprobe *p, struct pt_regs *regs);
/* Number of bytes of text poked */
int tp_len;
};
@@ -107,7 +118,6 @@ extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
extern int kprobe_int3_handler(struct pt_regs *regs);
-extern int kprobe_debug_handler(struct pt_regs *regs);
#else
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 323641097f63..da47f60a4650 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -1,29 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL)
BUILD_BUG_ON(1)
#endif
/*
- * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
- * "static_call()"s. They are also intended for use when defining
- * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
- * functions that follow the [svm|vmx]_func_name convention.
- * KVM_X86_OP_NULL() can leave a NULL definition for the
- * case where there is no definition or a function name that
- * doesn't match the typical naming convention is supplied.
+ * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * to make a definition optional, but in this case the default will
+ * be __static_call_return0.
*/
-KVM_X86_OP_NULL(hardware_enable)
-KVM_X86_OP_NULL(hardware_disable)
-KVM_X86_OP_NULL(hardware_unsetup)
-KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(hardware_enable)
+KVM_X86_OP(hardware_disable)
+KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
-KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_destroy)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
KVM_X86_OP(vcpu_reset)
-KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(prepare_switch_to_guest)
KVM_X86_OP(vcpu_load)
KVM_X86_OP(vcpu_put)
KVM_X86_OP(update_exception_bitmap)
@@ -33,8 +34,9 @@ KVM_X86_OP(get_segment_base)
KVM_X86_OP(get_segment)
KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
-KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(get_cs_db_l_bits)
KVM_X86_OP(set_cr0)
+KVM_X86_OP_OPTIONAL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
KVM_X86_OP(set_cr4)
KVM_X86_OP(set_efer)
@@ -47,21 +49,23 @@ KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
-KVM_X86_OP(tlb_flush_all)
-KVM_X86_OP(tlb_flush_current)
-KVM_X86_OP_NULL(tlb_remote_flush)
-KVM_X86_OP_NULL(tlb_remote_flush_with_range)
-KVM_X86_OP(tlb_flush_gva)
-KVM_X86_OP(tlb_flush_guest)
-KVM_X86_OP(run)
-KVM_X86_OP_NULL(handle_exit)
-KVM_X86_OP_NULL(skip_emulated_instruction)
-KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(get_if_flag)
+KVM_X86_OP(flush_tlb_all)
+KVM_X86_OP(flush_tlb_current)
+KVM_X86_OP_OPTIONAL(tlb_remote_flush)
+KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP(flush_tlb_gva)
+KVM_X86_OP(flush_tlb_guest)
+KVM_X86_OP(vcpu_pre_run)
+KVM_X86_OP(vcpu_run)
+KVM_X86_OP(handle_exit)
+KVM_X86_OP(skip_emulated_instruction)
+KVM_X86_OP_OPTIONAL(update_emulated_instruction)
KVM_X86_OP(set_interrupt_shadow)
KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
-KVM_X86_OP(set_irq)
-KVM_X86_OP(set_nmi)
+KVM_X86_OP(inject_irq)
+KVM_X86_OP(inject_nmi)
KVM_X86_OP(queue_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
@@ -70,54 +74,61 @@ KVM_X86_OP(get_nmi_mask)
KVM_X86_OP(set_nmi_mask)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
-KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(check_apicv_inhibit_reasons)
-KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
KVM_X86_OP(refresh_apicv_exec_ctrl)
-KVM_X86_OP(hwapic_irr_update)
-KVM_X86_OP(hwapic_isr_update)
-KVM_X86_OP_NULL(guest_apic_has_interrupt)
-KVM_X86_OP(load_eoi_exitmap)
-KVM_X86_OP(set_virtual_apic_mode)
-KVM_X86_OP_NULL(set_apic_access_page_addr)
-KVM_X86_OP(deliver_posted_interrupt)
-KVM_X86_OP_NULL(sync_pir_to_irr)
-KVM_X86_OP(set_tss_addr)
-KVM_X86_OP(set_identity_map_addr)
+KVM_X86_OP_OPTIONAL(hwapic_irr_update)
+KVM_X86_OP_OPTIONAL(hwapic_isr_update)
+KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
+KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
+KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
+KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_interrupt)
+KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
+KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
+KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
-KVM_X86_OP_NULL(has_wbinvd_exit)
-KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(has_wbinvd_exit)
+KVM_X86_OP(get_l2_tsc_offset)
+KVM_X86_OP(get_l2_tsc_multiplier)
+KVM_X86_OP(write_tsc_offset)
+KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
-KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
-KVM_X86_OP_NULL(vcpu_blocking)
-KVM_X86_OP_NULL(vcpu_unblocking)
-KVM_X86_OP_NULL(update_pi_irte)
-KVM_X86_OP_NULL(apicv_post_state_restore)
-KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
-KVM_X86_OP_NULL(set_hv_timer)
-KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+KVM_X86_OP_OPTIONAL(vcpu_blocking)
+KVM_X86_OP_OPTIONAL(vcpu_unblocking)
+KVM_X86_OP_OPTIONAL(pi_update_irte)
+KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
+KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_OPTIONAL(set_hv_timer)
+KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
KVM_X86_OP(smi_allowed)
-KVM_X86_OP(pre_enter_smm)
-KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enter_smm)
+KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
-KVM_X86_OP_NULL(mem_enc_op)
-KVM_X86_OP_NULL(mem_enc_reg_region)
-KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(mem_enc_register_region)
+KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
+KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
+KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
+KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
-KVM_X86_OP_NULL(enable_direct_tlbflush)
-KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP_OPTIONAL(enable_direct_tlbflush)
+KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
-KVM_X86_OP_NULL(complete_emulated_msr)
+KVM_X86_OP(complete_emulated_msr)
+KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
#undef KVM_X86_OP
-#undef KVM_X86_OP_NULL
+#undef KVM_X86_OP_OPTIONAL
+#undef KVM_X86_OP_OPTIONAL_RET0
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
new file mode 100644
index 000000000000..fdfd8e06fee6
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites.
+ */
+KVM_X86_PMU_OP(pmc_perf_hw_id)
+KVM_X86_PMU_OP(pmc_is_enabled)
+KVM_X86_PMU_OP(pmc_idx_to_pmc)
+KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
+KVM_X86_PMU_OP(msr_idx_to_pmc)
+KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP(is_valid_msr)
+KVM_X86_PMU_OP(get_msr)
+KVM_X86_PMU_OP(set_msr)
+KVM_X86_PMU_OP(refresh)
+KVM_X86_PMU_OP(init)
+KVM_X86_PMU_OP(reset)
+KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
+KVM_X86_PMU_OP_OPTIONAL(cleanup)
+
+#undef KVM_X86_PMU_OP
+#undef KVM_X86_PMU_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..9217bd6cf0d1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/cpumask.h>
#include <linux/irq_work.h>
#include <linux/irq.h>
+#include <linux/workqueue.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -37,9 +38,20 @@
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
-#define KVM_MAX_VCPUS 288
-#define KVM_SOFT_MAX_VCPUS 240
-#define KVM_MAX_VCPU_ID 1023
+#define KVM_MAX_VCPUS 1024
+
+/*
+ * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
+ * might be larger than the actual number of VCPUs because the
+ * APIC ID encodes CPU topology information.
+ *
+ * In the worst case, we'll need less than one extra bit for the
+ * Core ID, and less than one extra bit for the Package (Die) ID,
+ * so ratio of 4 should be enough.
+ */
+#define KVM_VCPU_ID_RATIO 4
+#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
+
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
@@ -85,12 +97,14 @@
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
-#define KVM_REQ_HV_TLB_FLUSH \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_TLB_FLUSH_GUEST \
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
+ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -113,6 +127,7 @@
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define UNMAPPED_GVA (~(gpa_t)0)
+#define INVALID_GPA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
@@ -123,14 +138,7 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
-{
- /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
- return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-}
-
-#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
@@ -199,6 +207,7 @@ enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4
+#define DR6_BUS_LOCK (1 << 11)
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
#define DR6_BT (1 << 15)
@@ -212,7 +221,7 @@ enum x86_intercept_stage;
* DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
*/
#define DR6_ACTIVE_LOW 0xffff0ff0
-#define DR6_VOLATILE 0x0001e00f
+#define DR6_VOLATILE 0x0001e80f
#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff
@@ -221,14 +230,26 @@ enum x86_intercept_stage;
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_SINGLESTEP | \
+ KVM_GUESTDBG_USE_HW_BP | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_INJECT_BP | \
+ KVM_GUESTDBG_INJECT_DB | \
+ KVM_GUESTDBG_BLOCKIRQ)
+
+
#define PFERR_PRESENT_BIT 0
#define PFERR_WRITE_BIT 1
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
+#define PFERR_IMPLICIT_ACCESS_BIT 48
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -236,8 +257,10 @@ enum x86_intercept_stage;
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
PFERR_WRITE_MASK | \
@@ -256,29 +279,60 @@ enum x86_intercept_stage;
struct kvm_kernel_irq_routing_entry;
/*
- * the pages used as guest page table on soft mmu are tracked by
- * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
- * by indirect shadow page can not be more than 15 bits.
+ * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
+ * also includes TDP pages) to determine whether or not a page can be used in
+ * the given MMU context. This is a subset of the overall kvm_cpu_role to
+ * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
+ * 2 bytes per gfn instead of 4 bytes per gfn.
*
- * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
- * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ * Upper-level shadow pages having gptes are tracked for write-protection via
+ * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
+ * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
+ * gfn_track will overflow and explosions will ensure.
+ *
+ * A unique shadow page (SP) for a gfn is created if and only if an existing SP
+ * cannot be reused. The ability to reuse a SP is tracked by its role, which
+ * incorporates various mode bits and properties of the SP. Roughly speaking,
+ * the number of unique SPs that can theoretically be created is 2^n, where n
+ * is the number of bits that are used to compute the role.
+ *
+ * But, even though there are 19 bits in the mask below, not all combinations
+ * of modes and flags are possible:
+ *
+ * - invalid shadow pages are not accounted, so the bits are effectively 18
+ *
+ * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
+ * execonly and ad_disabled are only used for nested EPT which has
+ * has_4_byte_gpte=0. Therefore, 2 bits are always unused.
+ *
+ * - the 4 bits of level are effectively limited to the values 2/3/4/5,
+ * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE
+ * paging has exactly one upper level, making level completely redundant
+ * when has_4_byte_gpte=1.
+ *
+ * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
+ * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ *
+ * Therefore, the maximum number of possible upper-level shadow pages for a
+ * single gfn is a bit less than 2^13.
*/
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
- unsigned gpte_is_8_bytes:1;
+ unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
- unsigned nxe:1;
+ unsigned efer_nx:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
- unsigned :6;
+ unsigned passthrough:1;
+ unsigned :5;
/*
* This is left at the top of the word so that
@@ -290,28 +344,40 @@ union kvm_mmu_page_role {
};
};
-union kvm_mmu_extended_role {
/*
- * This structure complements kvm_mmu_page_role caching everything needed for
- * MMU configuration. If nothing in both these structures changed, MMU
- * re-configuration can be skipped. @valid bit is set on first usage so we don't
- * treat all-zero structure as valid data.
+ * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
+ * relevant to the current MMU configuration. When loading CR0, CR4, or EFER,
+ * including on nested transitions, if nothing in the full role changes then
+ * MMU re-configuration can be skipped. @valid bit is set on first usage so we
+ * don't treat all-zero structure as valid data.
+ *
+ * The properties that are tracked in the extended role but not the page role
+ * are for things that either (a) do not affect the validity of the shadow page
+ * or (b) are indirectly reflected in the shadow page's role. For example,
+ * CR4.PKE only affects permission checks for software walks of the guest page
+ * tables (because KVM doesn't support Protection Keys with shadow paging), and
+ * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
+ *
+ * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
+ * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
+ * SMAP, but the MMU's permission checks for software walks need to be SMEP and
+ * SMAP aware regardless of CR0.WP.
*/
+union kvm_mmu_extended_role {
u32 word;
struct {
unsigned int valid:1;
unsigned int execonly:1;
- unsigned int cr0_pg:1;
- unsigned int cr4_pae:1;
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
- unsigned int maxphyaddr:6;
+ unsigned int cr4_la57:1;
+ unsigned int efer_lma:1;
};
};
-union kvm_mmu_role {
+union kvm_cpu_role {
u64 as_u64;
struct {
union kvm_mmu_page_role base;
@@ -351,6 +417,7 @@ struct kvm_mmu_root_info {
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
+struct kvm_page_fault;
/*
* x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
@@ -360,32 +427,18 @@ struct kvm_mmu_page;
struct kvm_mmu {
unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
- int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
- bool prefault);
+ int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
- u32 access, struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t gva_or_gpa, u64 access,
+ struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
- hpa_t root_hpa;
- gpa_t root_pgd;
- union kvm_mmu_role mmu_role;
- u8 root_level;
- u8 shadow_root_level;
- u8 ept_ad;
- bool direct_map;
- struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
-
- /*
- * Bitmap; bit set = permission fault
- * Byte index: page fault error code [4:1]
- * Bit index: pte permissions in ACC_* format
- */
- u8 permissions[16];
+ struct kvm_mmu_root_info root;
+ union kvm_cpu_role cpu_role;
+ union kvm_mmu_page_role root_role;
/*
* The pkru_mask indicates if protection key checks are needed. It
@@ -395,8 +448,18 @@ struct kvm_mmu {
*/
u32 pkru_mask;
+ struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
u64 *pae_root;
- u64 *lm_root;
+ u64 *pml4_root;
+ u64 *pml5_root;
/*
* check zero bits on shadow page table entries, these
@@ -407,11 +470,6 @@ struct kvm_mmu {
struct rsvd_bits_validate guest_rsvd_check;
- /* Can have large pages at levels 2..last_nonleaf_level-1. */
- u8 last_nonleaf_level;
-
- bool nx;
-
u64 pdptrs[4]; /* pae */
};
@@ -437,8 +495,11 @@ struct kvm_pmc {
* ctrl value for fixed counters.
*/
u64 current_config;
+ bool is_paused;
+ bool intr;
};
+#define KVM_PMC_MAX_FIXED 3
struct kvm_pmu {
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
@@ -446,14 +507,14 @@ struct kvm_pmu {
u64 fixed_ctr_ctrl;
u64 global_ctrl;
u64 global_status;
- u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
+ u64 raw_event_mask;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
- struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
+ struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
struct irq_work irq_work;
DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
@@ -477,7 +538,6 @@ struct kvm_pmu_ops;
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
- KVM_DEBUGREG_RELOAD = 4,
};
struct kvm_mtrr_range {
@@ -529,22 +589,36 @@ struct kvm_vcpu_hv {
struct kvm_hyperv_exit exit;
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
- cpumask_t tlb_flush;
+ bool enforce_cpuid;
+ struct {
+ u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
+ u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
+ u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
+ u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
+ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
+ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+ } cpuid_cache;
};
/* Xen HVM per vcpu emulation context */
struct kvm_vcpu_xen {
u64 hypercall_rip;
u32 current_runstate;
- bool vcpu_info_set;
- bool vcpu_time_info_set;
- bool runstate_set;
- struct gfn_to_hva_cache vcpu_info_cache;
- struct gfn_to_hva_cache vcpu_time_info_cache;
- struct gfn_to_hva_cache runstate_cache;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache vcpu_info_cache;
+ struct gfn_to_pfn_cache vcpu_time_info_cache;
+ struct gfn_to_pfn_cache runstate_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
+ unsigned long evtchn_pending_sel;
+ u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+ u32 timer_virq;
+ u64 timer_expires; /* In guest epoch */
+ atomic_t timer_pending;
+ struct hrtimer timer;
+ int poll_evtchn;
+ struct timer_list poll_timer;
};
struct kvm_vcpu_arch {
@@ -579,8 +653,10 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
u64 smbase;
u64 smi_count;
+ bool at_instruction_boundary;
bool tpr_access_reporting;
bool xsaves_enabled;
+ bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
@@ -630,18 +706,17 @@ struct kvm_vcpu_arch {
*
* Note that while the PKRU state lives inside the fpu registers,
* it is switched out separately at VMENTER and VMEXIT time. The
- * "guest_fpu" state here contains the guest FPU context, with the
+ * "guest_fpstate" state here contains the guest FPU context, with the
* host PRKU bits.
*/
- struct fpu *user_fpu;
- struct fpu *guest_fpu;
+ struct fpu_guest guest_fpu;
u64 xcr0;
- u64 guest_supported_xcr0;
struct kvm_pio_request pio;
void *pio_data;
- void *guest_ins_data;
+ void *sev_pio_data;
+ unsigned sev_pio_count;
u8 event_exit_inst_len;
@@ -666,10 +741,10 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
+ u32 kvm_cpuid_base;
u64 reserved_gpa_bits;
int maxphyaddr;
- int max_tdp_level;
/* emulate context */
@@ -681,8 +756,7 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
- struct gfn_to_hva_cache pv_time;
- bool pv_time_enabled;
+ struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
@@ -690,11 +764,11 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_pfn_cache cache;
+ struct gfn_to_hva_cache cache;
} st;
u64 l1_tsc_offset;
- u64 tsc_offset;
+ u64 tsc_offset; /* current tsc offset */
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@@ -708,12 +782,14 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
u64 msr_ia32_power_ctl;
- u64 tsc_scaling_ratio;
+ u64 l1_tsc_scaling_ratio;
+ u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
+ u8 handling_intr_from_guest;
struct kvm_mtrr mtrr_state;
u64 pat;
@@ -816,7 +892,7 @@ struct kvm_vcpu_arch {
bool l1tf_flush_l1d;
/* Host CPU on which VM-entry was most recently attempted */
- unsigned int last_vmentry_cpu;
+ int last_vmentry_cpu;
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
@@ -838,6 +914,16 @@ struct kvm_vcpu_arch {
/* Protected Guests */
bool guest_state_protected;
+
+ /*
+ * Set when PDPTS were loaded directly by the userspace without
+ * reading the guest memory
+ */
+ bool pdptrs_from_userspace;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+#endif
};
struct kvm_lpage_info {
@@ -890,12 +976,10 @@ enum hv_tsc_page_status {
HV_TSC_PAGE_UNSET = 0,
/* TSC page MSR was written by the guest, update pending */
HV_TSC_PAGE_GUEST_CHANGED,
- /* TSC page MSR was written by KVM userspace, update pending */
+ /* TSC page update was triggered from the host side */
HV_TSC_PAGE_HOST_CHANGED,
/* TSC page was properly set up and is currently active */
HV_TSC_PAGE_SET,
- /* TSC page is currently being updated and therefore is inactive */
- HV_TSC_PAGE_UPDATING,
/* TSC page was set up with an inaccessible GPA */
HV_TSC_PAGE_BROKEN,
};
@@ -923,6 +1007,12 @@ struct kvm_hv {
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
+ /*
+ * How many SynICs use 'AutoEOI' feature
+ * (protected by arch.apicv_update_lock)
+ */
+ unsigned int synic_auto_eoi_used;
+
struct hv_partition_assist_pg *hv_pa_pg;
struct kvm_hv_syndbg hv_syndbg;
};
@@ -936,10 +1026,12 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ u32 xen_version;
bool long_mode;
- bool shinfo_set;
u8 upcall_vector;
- struct gfn_to_hva_cache shinfo_cache;
+ struct gfn_to_pfn_cache shinfo_cache;
+ struct idr evtchn_ports;
+ unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};
enum kvm_irqchip_mode {
@@ -954,12 +1046,80 @@ struct kvm_x86_msr_filter {
struct msr_bitmap_range ranges[16];
};
-#define APICV_INHIBIT_REASON_DISABLE 0
-#define APICV_INHIBIT_REASON_HYPERV 1
-#define APICV_INHIBIT_REASON_NESTED 2
-#define APICV_INHIBIT_REASON_IRQWIN 3
-#define APICV_INHIBIT_REASON_PIT_REINJ 4
-#define APICV_INHIBIT_REASON_X2APIC 5
+enum kvm_apicv_inhibit {
+
+ /********************************************************************/
+ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+ /********************************************************************/
+
+ /*
+ * APIC acceleration is disabled by a module parameter
+ * and/or not supported in hardware.
+ */
+ APICV_INHIBIT_REASON_DISABLE,
+
+ /*
+ * APIC acceleration is inhibited because AutoEOI feature is
+ * being used by a HyperV guest.
+ */
+ APICV_INHIBIT_REASON_HYPERV,
+
+ /*
+ * APIC acceleration is inhibited because the userspace didn't yet
+ * enable the kernel/split irqchip.
+ */
+ APICV_INHIBIT_REASON_ABSENT,
+
+ /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
+ * (out of band, debug measure of blocking all interrupts on this vCPU)
+ * was enabled, to avoid AVIC/APICv bypassing it.
+ */
+ APICV_INHIBIT_REASON_BLOCKIRQ,
+
+ /*
+ * For simplicity, the APIC acceleration is inhibited
+ * first time either APIC ID or APIC base are changed by the guest
+ * from their reset values.
+ */
+ APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+ /******************************************************/
+ /* INHIBITs that are relevant only to the AMD's AVIC. */
+ /******************************************************/
+
+ /*
+ * AVIC is inhibited on a vCPU because it runs a nested guest.
+ *
+ * This is needed because unlike APICv, the peers of this vCPU
+ * cannot use the doorbell mechanism to signal interrupts via AVIC when
+ * a vCPU runs nested.
+ */
+ APICV_INHIBIT_REASON_NESTED,
+
+ /*
+ * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
+ * which cannot be injected when the AVIC is enabled, thus AVIC
+ * is inhibited while KVM waits for IRQ window.
+ */
+ APICV_INHIBIT_REASON_IRQWIN,
+
+ /*
+ * PIT (i8254) 're-inject' mode, relies on EOI intercept,
+ * which AVIC doesn't support for edge triggered interrupts.
+ */
+ APICV_INHIBIT_REASON_PIT_REINJ,
+
+ /*
+ * AVIC is inhibited because the guest has x2apic in its CPUID.
+ */
+ APICV_INHIBIT_REASON_X2APIC,
+
+ /*
+ * AVIC is disabled because SEV doesn't support it.
+ */
+ APICV_INHIBIT_REASON_SEV,
+};
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -973,6 +1133,13 @@ struct kvm_arch {
struct list_head lpage_disallowed_mmu_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
+ /*
+ * Protects marking pages unsync during page faults, as TDP MMU page
+ * faults only take mmu_lock for read. For simplicity, the unsync
+ * pages lock is always taken when marking pages unsync regardless of
+ * whether mmu_lock is held for read or write.
+ */
+ spinlock_t mmu_unsync_pages_lock;
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
@@ -989,7 +1156,10 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- bool apic_access_page_done;
+ /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
+
+ bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1001,17 +1171,25 @@ struct kvm_arch {
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
+
+ /*
+ * This also protects nr_vcpus_matched_tsc which is read from a
+ * preemption-disabled region, so it must be a raw spinlock.
+ */
raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
u64 last_tsc_write;
u32 last_tsc_khz;
+ u64 last_tsc_offset;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
- spinlock_t pvclock_gtod_sync_lock;
+ u32 default_tsc_khz;
+
+ seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
@@ -1026,10 +1204,6 @@ struct kvm_arch {
struct kvm_hv hyperv;
struct kvm_xen xen;
- #ifdef CONFIG_KVM_MMU_AUDIT
- int audit_point;
- #endif
-
bool backwards_tsc_observed;
bool boot_vcpu_runs_old_kvmclock;
u32 bsp_vcpu_id;
@@ -1049,11 +1223,23 @@ struct kvm_arch {
bool exception_payload_enabled;
bool bus_lock_detection_enabled;
+ bool enable_pmu;
+ /*
+ * If exit_on_emulation_error is set, and the in-kernel instruction
+ * emulator fails to emulate an instruction, allow userspace
+ * the opportunity to look at it.
+ */
+ bool exit_on_emulation_error;
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
+ u32 hypercall_exit_enabled;
+
+ /* Guest can access the SGX PROVISIONKEY. */
+ bool sgx_provisioning_allowed;
+
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
@@ -1068,25 +1254,36 @@ struct kvm_arch {
bool tdp_mmu_enabled;
/*
- * List of struct kvmp_mmu_pages being used as roots.
+ * List of struct kvm_mmu_pages being used as roots.
* All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
- * All struct kvm_mmu_pages in the list should have a positive
- * root_count except when a thread holds the MMU lock and is removing
- * an entry from the list.
+ *
+ * For reads, this list is protected by:
+ * the MMU lock in read mode + RCU or
+ * the MMU lock in write mode
+ *
+ * For writes, this list is protected by:
+ * the MMU lock in read mode + the tdp_mmu_pages_lock or
+ * the MMU lock in write mode
+ *
+ * Roots will remain in the list until their tdp_mmu_root_count
+ * drops to zero, at which point the thread that decremented the
+ * count to zero should removed the root from the list and clean
+ * it up, freeing the root after an RCU grace period.
*/
struct list_head tdp_mmu_roots;
/*
* List of struct kvmp_mmu_pages not being used as roots.
* All struct kvm_mmu_pages in the list should have
- * tdp_mmu_page set and a root_count of 0.
+ * tdp_mmu_page set and a tdp_mmu_root_count of 0.
*/
struct list_head tdp_mmu_pages;
/*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
+ * - tdp_mmu_roots (above)
* - tdp_mmu_pages (above)
* - the link field of struct kvm_mmu_pages used by the TDP MMU
* - lpage_disallowed_mmu_pages
@@ -1096,25 +1293,52 @@ struct kvm_arch {
* the thread holds the MMU lock in write mode.
*/
spinlock_t tdp_mmu_pages_lock;
+ struct workqueue_struct *tdp_mmu_zap_wq;
#endif /* CONFIG_X86_64 */
+
+ /*
+ * If set, at least one shadow root has been allocated. This flag
+ * is used as one input when determining whether certain memslot
+ * related allocations are necessary.
+ */
+ bool shadow_root_allocated;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+ spinlock_t hv_root_tdp_lock;
+#endif
};
struct kvm_vm_stat {
- ulong mmu_shadow_zapped;
- ulong mmu_pte_write;
- ulong mmu_pde_zapped;
- ulong mmu_flooded;
- ulong mmu_recycled;
- ulong mmu_cache_miss;
- ulong mmu_unsync;
- ulong remote_tlb_flush;
- ulong lpages;
- ulong nx_lpage_splits;
- ulong max_mmu_page_hash_collisions;
+ struct kvm_vm_stat_generic generic;
+ u64 mmu_shadow_zapped;
+ u64 mmu_pte_write;
+ u64 mmu_pde_zapped;
+ u64 mmu_flooded;
+ u64 mmu_recycled;
+ u64 mmu_cache_miss;
+ u64 mmu_unsync;
+ union {
+ struct {
+ atomic64_t pages_4k;
+ atomic64_t pages_2m;
+ atomic64_t pages_1g;
+ };
+ atomic64_t pages[KVM_NR_PAGE_SIZES];
+ };
+ u64 nx_lpage_splits;
+ u64 max_mmu_page_hash_collisions;
+ u64 max_mmu_rmap_size;
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
+ u64 pf_taken;
u64 pf_fixed;
+ u64 pf_emulate;
+ u64 pf_spurious;
+ u64 pf_fast;
+ u64 pf_mmio_spte_created;
u64 pf_guest;
u64 tlb_flush;
u64 invlpg;
@@ -1127,10 +1351,6 @@ struct kvm_vcpu_stat {
u64 nmi_window_exits;
u64 l1d_flush;
u64 halt_exits;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_invalid;
- u64 halt_wakeup;
u64 request_irq_exits;
u64 irq_exits;
u64 host_state_reload;
@@ -1141,8 +1361,12 @@ struct kvm_vcpu_stat {
u64 irq_injections;
u64 nmi_injections;
u64 req_event;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
+ u64 nested_run;
+ u64 directed_yield_attempted;
+ u64 directed_yield_successful;
+ u64 preemption_reported;
+ u64 preemption_other;
+ u64 guest_mode;
};
struct x86_instruction_info;
@@ -1170,10 +1394,11 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
}
struct kvm_x86_ops {
+ const char *name;
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
- bool (*cpu_has_accelerated_tpr)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
@@ -1186,7 +1411,7 @@ struct kvm_x86_ops {
void (*vcpu_free)(struct kvm_vcpu *vcpu);
void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
@@ -1201,6 +1426,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
@@ -1213,9 +1439,10 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
- void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
- void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
int (*tlb_remote_flush)(struct kvm *kvm);
int (*tlb_remote_flush_with_range)(struct kvm *kvm,
struct kvm_tlb_range *range);
@@ -1226,15 +1453,16 @@ struct kvm_x86_ops {
* Can potentially get non-canonical addresses through INVLPGs, which
* the implementation may choose to ignore if appropriate.
*/
- void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
+ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr);
/*
* Flush any TLB entries created by the guest. Like tlb_flush_gva(),
* does not need to flush GPA->HPA mappings.
*/
- void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1243,8 +1471,8 @@ struct kvm_x86_ops {
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
- void (*set_irq)(struct kvm_vcpu *vcpu);
- void (*set_nmi)(struct kvm_vcpu *vcpu);
+ void (*inject_irq)(struct kvm_vcpu *vcpu);
+ void (*inject_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
@@ -1254,8 +1482,7 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*check_apicv_inhibit_reasons)(ulong bit);
- void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate);
+ bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1263,25 +1490,29 @@ struct kvm_x86_ops {
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
- int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+ void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level);
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level);
bool (*has_wbinvd_exit)(void);
- /* Returns actual tsc_offset set in active VMCS */
- u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
+ u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
/*
- * Retrieve somewhat arbitrary exit information. Intended to be used
- * only from within tracepoints to avoid VMREADs when tracing is off.
+ * Retrieve somewhat arbitrary exit information. Intended to
+ * be used only from within tracepoints or error paths.
*/
- void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *exit_int_info, u32 *exit_int_info_err_code);
int (*check_intercept)(struct kvm_vcpu *vcpu,
@@ -1301,27 +1532,14 @@ struct kvm_x86_ops {
int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
- /* pmu operations of sub-arch */
- const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
- /*
- * Architecture specific hooks for vCPU blocking due to
- * HLT instruction.
- * Returns for .pre_block():
- * - 0 means continue to block the vCPU.
- * - 1 means we cannot block the vCPU since some event
- * happens during this period, such as, 'ON' bit in
- * posted-interrupts descriptor is set.
- */
- int (*pre_block)(struct kvm_vcpu *vcpu);
- void (*post_block)(struct kvm_vcpu *vcpu);
-
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
+ void (*pi_start_assignment)(struct kvm *kvm);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1332,17 +1550,21 @@ struct kvm_x86_ops {
void (*setup_mce)(struct kvm_vcpu *vcpu);
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
- int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
- int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
+ int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
- int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
- int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
- int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ void (*guest_memory_reclaimed)(struct kvm *kvm);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
+ bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@@ -1352,11 +1574,20 @@ struct kvm_x86_ops {
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+ /*
+ * Returns vCPU specific APICv inhibit reasons
+ */
+ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
+ void (*leave_nested)(struct kvm_vcpu *vcpu);
int (*check_events)(struct kvm_vcpu *vcpu);
+ bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
unsigned user_data_size);
@@ -1376,8 +1607,10 @@ struct kvm_x86_init_ops {
int (*disabled_by_bios)(void);
int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
+ unsigned int (*handle_intel_pt_intr)(void);
struct kvm_x86_ops *runtime_ops;
+ struct kvm_pmu_ops *pmu_ops;
};
struct kvm_arch_async_pf {
@@ -1387,28 +1620,25 @@ struct kvm_arch_async_pf {
bool direct_map;
};
+extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
+extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
-static inline void kvm_ops_static_call_update(void)
-{
-#define KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP_NULL KVM_X86_OP
-#include <asm/kvm-x86-ops.h>
-}
-
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}
+
+#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
@@ -1421,34 +1651,39 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
return -ENOTSUPP;
}
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
+#define kvm_arch_pmi_in_guest(vcpu) \
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+
+void kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask);
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level);
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
-bool pdptrs_changed(struct kvm_vcpu *vcpu);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -1488,7 +1723,7 @@ extern u64 kvm_mce_cap_supported;
/*
* EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
* userspace I/O) to indicate that the emulation context
- * should be resued as is, i.e. skip initialization of
+ * should be reused as is, i.e. skip initialization of
* emulation context, instruction fetch and decode.
*
* EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
@@ -1498,7 +1733,8 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
* decode the instruction length. For use *only* by
- * kvm_x86_ops.skip_emulated_instruction() implementations.
+ * kvm_x86_ops.skip_emulated_instruction() implementations if
+ * EMULTYPE_COMPLETE_USER_EXIT is not set.
*
* EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
* retry native execution under certain conditions,
@@ -1513,11 +1749,15 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
* backdoor emulation, which is opt in via module param.
- * VMware backoor emulation handles select instructions
+ * VMware backdoor emulation handles select instructions
* and reinjects the #GP for all other cases.
*
* EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
* case the CR2/GPA value pass on the stack is valid.
+ *
+ * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
+ * state and inject single-step #DBs after skipping
+ * an instruction (after completing userspace I/O).
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -1526,10 +1766,14 @@ extern u64 kvm_mce_cap_supported;
#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
#define EMULTYPE_VMWARE_GP (1 << 5)
#define EMULTYPE_PF (1 << 6)
+#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len);
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+ u64 *data, u8 ndata);
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -1538,11 +1782,16 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
@@ -1553,8 +1802,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu);
-
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -1565,15 +1812,14 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1583,9 +1829,6 @@ void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t gfn, void *data, int offset, int len,
- u32 access);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
@@ -1613,14 +1856,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free);
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1631,10 +1869,24 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
-void kvm_apicv_init(struct kvm *kvm, bool enable);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
-void kvm_request_apicv_update(struct kvm *kvm, bool activate,
- unsigned long bit);
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+
+static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
+{
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, true);
+}
+
+static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
+{
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
+}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -1644,11 +1896,10 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t gva, hpa_t root_hpa);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
- bool skip_mmu_sync);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);
static inline u16 kvm_read_ldt(void)
{
@@ -1672,11 +1923,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline u32 get_rdx_init_val(void)
-{
- return 0x600; /* P6 family */
-}
-
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -1709,59 +1955,36 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
-asmlinkage void kvm_spurious_fault(void);
-
-/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Usually after catching the fault we just panic; during reboot
- * instead the instruction is ignored.
- */
-#define __kvm_handle_fault_on_reboot(insn) \
- "666: \n\t" \
- insn "\n\t" \
- "jmp 668f \n\t" \
- "667: \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_begin \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "call kvm_spurious_fault \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_end \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "668: \n\t" \
- _ASM_EXTABLE(666b, 667b)
-
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
-void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_add_user_return_msr(u32 msr);
+int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
+static inline bool kvm_is_supported_user_return_msr(u32 msr)
+{
+ return kvm_find_user_return_msr(msr) >= 0;
+}
+
+u64 kvm_scale_tsc(u64 tsc, u64 ratio);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
-void kvm_make_mclock_inprogress_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap);
@@ -1780,8 +2003,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
-int kvm_is_in_guest(void);
-
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
@@ -1810,8 +2031,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
-static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
-
static inline int kvm_cpu_get_apicid(int mps_cpu)
{
#ifdef CONFIG_X86_LOCAL_APIC
@@ -1830,4 +2049,17 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
int kvm_cpu_dirty_log_size(void);
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
+
+#define KVM_CLOCK_VALID_FLAGS \
+ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+#define KVM_X86_VALID_QUIRKS \
+ (KVM_X86_QUIRK_LINT0_REENABLED | \
+ KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
+ KVM_X86_QUIRK_OUT_7E_INC_RIP | \
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index 87bd6025d91d..eb186bc57f6a 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -46,11 +46,15 @@ struct kvm_page_track_notifier_node {
struct kvm_page_track_notifier_node *node);
};
-void kvm_page_track_init(struct kvm *kvm);
+int kvm_page_track_init(struct kvm *kvm);
void kvm_page_track_cleanup(struct kvm *kvm);
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages);
void kvm_slot_page_track_add_page(struct kvm *kvm,
@@ -59,8 +63,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
void kvm_slot_page_track_remove_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode);
-bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode);
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode);
void
kvm_page_track_register_notifier(struct kvm *kvm,
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 338119852512..57bc74e112f2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,7 +7,7 @@
#include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h>
-extern void kvmclock_init(void);
+#include <asm/tdx.h>
#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
@@ -34,6 +34,10 @@ static inline bool kvm_check_and_clear_guest_paused(void)
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, 0, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr)
@@ -44,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr)
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1)
@@ -55,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2)
@@ -66,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
@@ -78,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p4)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, p4);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
@@ -85,14 +105,27 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
return ret;
}
+static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+
+ asm volatile("vmmcall"
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
#ifdef CONFIG_KVM_GUEST
+void kvmclock_init(void);
+void kvmclock_disable(void);
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_apf_flags(void);
-void kvm_disable_steal_time(void);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -137,11 +170,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void)
return 0;
}
-static inline void kvm_disable_steal_time(void)
-{
- return;
-}
-
static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
return false;
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
index eceea9299097..6c5765192102 100644
--- a/arch/x86/include/asm/kvmclock.h
+++ b/arch/x86/include/asm/kvmclock.h
@@ -2,6 +2,20 @@
#ifndef _ASM_X86_KVM_CLOCK_H
#define _ASM_X86_KVM_CLOCK_H
+#include <linux/percpu.h>
+
extern struct clocksource kvm_clock;
+DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+ return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+ return this_cpu_read(hv_clock_per_cpu);
+}
+
#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 365111789cc6..85865f1645bd 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -3,6 +3,7 @@
#define _ASM_X86_LINKAGE_H
#include <linux/stringify.h>
+#include <asm/ibt.h>
#undef notrace
#define notrace __attribute__((no_instrument_function))
@@ -18,7 +19,51 @@
#define __ALIGN_STR __stringify(__ALIGN)
#endif
+#ifdef CONFIG_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
+#define ASM_RET "ret\n\t"
+#endif
+
#endif /* __ASSEMBLY__ */
+/* SYM_FUNC_START -- use for global functions */
+#define SYM_FUNC_START(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ ENDBR
+
+/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
+#define SYM_FUNC_START_NOALIGN(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
+ ENDBR
+
+/* SYM_FUNC_START_LOCAL -- use for local functions */
+#define SYM_FUNC_START_LOCAL(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
+ ENDBR
+
+/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
+#define SYM_FUNC_START_LOCAL_NOALIGN(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
+ ENDBR
+
+/* SYM_FUNC_START_WEAK -- use for weak functions */
+#define SYM_FUNC_START_WEAK(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
+ ENDBR
+
+/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
+#define SYM_FUNC_START_WEAK_NOALIGN(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
+ ENDBR
+
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
deleted file mode 100644
index 7c5cc6660e4b..000000000000
--- a/arch/x86/include/asm/livepatch.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * livepatch.h - x86-specific Kernel Live Patching Core
- *
- * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
- * Copyright (C) 2014 SUSE
- */
-
-#ifndef _ASM_X86_LIVEPATCH_H
-#define _ASM_X86_LIVEPATCH_H
-
-#include <asm/setup.h>
-#include <linux/ftrace.h>
-
-static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip)
-{
- ftrace_instruction_pointer_set(fregs, ip);
-}
-
-#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..cc73061e7255 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -205,28 +205,16 @@ struct cper_ia_proc_ctx;
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
-void mcheck_vendor_init_severity(void);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
-static inline void mcheck_vendor_init_severity(void) {}
static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id) { return -EINVAL; }
#endif
-#ifdef CONFIG_X86_ANCIENT_MCE
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
-#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
-#endif
-
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct device *, mce_device);
@@ -265,6 +253,7 @@ enum mcp_flags {
MCP_TIMESTAMP = BIT(0), /* log time stamp */
MCP_UC = BIT(1), /* log uncorrected errors */
MCP_DONTLOG = BIT(2), /* only clear, don't log */
+ MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */
};
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
@@ -305,7 +294,7 @@ extern void apei_mce_report_mem_error(int corrected,
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
- SMCA_LS_V2, /* Load Store */
+ SMCA_LS_V2,
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -314,36 +303,32 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
- SMCA_CS_V2, /* Coherent Slave */
+ SMCA_CS_V2,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
- SMCA_PSP_V2, /* Platform Security Processor */
+ SMCA_PSP_V2,
SMCA_SMU, /* System Management Unit */
- SMCA_SMU_V2, /* System Management Unit */
+ SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /* NBIF Unit */
+ SMCA_SHUB, /* System HUB Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
-#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-
-struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
- u32 hwid_mcatype; /* (hwid,mcatype) tuple */
- u8 count; /* Number of instances. */
-};
-
-struct smca_bank {
- struct smca_hwid *hwid;
- u32 id; /* Value of MCA_IPID[InstanceId]. */
- u8 sysfs_id; /* Value used for sysfs name. */
-};
-
-extern struct smca_bank smca_banks[MAX_NR_BANKS];
-
extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
@@ -351,16 +336,13 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
void mce_amd_feature_init(struct cpuinfo_x86 *c);
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
-
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
-static inline int
-umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
#endif
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 31c4df123aa0..88ceaf3648b3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -13,6 +13,7 @@
#ifndef __ASSEMBLY__
#include <linux/init.h>
+#include <linux/cc_platform.h>
#include <asm/bootparam.h>
@@ -20,7 +21,6 @@
extern u64 sme_me_mask;
extern u64 sev_status;
-extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
@@ -44,16 +44,12 @@ void __init sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
+ bool enc);
void __init mem_encrypt_free_decrypted_mem(void);
-/* Architecture __weak replacement functions */
-void __init mem_encrypt_init(void);
-
void __init sev_es_init_vc_handling(void);
-bool sme_active(void);
-bool sev_active(void);
-bool sev_es_active(void);
#define __bss_decrypted __section(".bss..decrypted")
@@ -76,14 +72,13 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
-static inline bool sme_active(void) { return false; }
-static inline bool sev_active(void) { return false; }
-static inline bool sev_es_active(void) { return false; }
static inline int __init
early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline void __init
+early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {}
static inline void mem_encrypt_free_decrypted_mem(void) { }
@@ -91,6 +86,9 @@ static inline void mem_encrypt_free_decrypted_mem(void) { }
#endif /* CONFIG_AMD_MEM_ENCRYPT */
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
/*
* The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
* writing to or comparing values from the cr3 register. Having the
@@ -102,11 +100,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { }
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-static inline bool mem_encrypt_active(void)
-{
- return sme_me_mask;
-}
-
static inline u64 sme_get_me_mask(void)
{
return sme_me_mask;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ab45a220fac4..0c3d3440fe27 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -130,14 +130,13 @@ static inline unsigned int x86_cpuid_family(void)
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
-extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
+void microcode_bsp_resume(void);
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
-static inline bool
-get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
+static inline void microcode_bsp_resume(void) { }
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 7063b5a43220..ac31f9140d07 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -38,7 +38,7 @@ struct microcode_header_amd {
struct microcode_amd {
struct microcode_header_amd hdr;
- unsigned int mpb[0];
+ unsigned int mpb[];
};
#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index d85a07d7154f..4c92cea7e4b5 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -19,7 +19,7 @@ struct microcode_header_intel {
struct microcode_intel {
struct microcode_header_intel hdr;
- unsigned int bits[0];
+ unsigned int bits[];
};
/* microcode format is extended from prescott processors */
@@ -33,7 +33,7 @@ struct extended_sigtable {
unsigned int count;
unsigned int cksum;
unsigned int reserved[3];
- struct extended_signature sigs[0];
+ struct extended_signature sigs[];
};
#define DEFAULT_UCODE_DATASIZE (2000)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 27516046117a..b8d40ddeab00 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -141,7 +141,7 @@ do { \
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
- lazy_load_gs(0); \
+ loadsegment(gs, 0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
deleted file mode 100644
index f572d0f944bb..000000000000
--- a/arch/x86/include/asm/mmx.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MMX_H
-#define _ASM_X86_MMX_H
-
-/*
- * MMX 3Dnow! helper operations
- */
-
-#include <linux/types.h>
-
-extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
-extern void mmx_copy_page(void *to, void *from);
-
-#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index ccf60a809a17..61f0c206bff0 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -9,79 +9,29 @@
#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
#include <asm/paravirt.h>
+#include <asm/mshyperv.h>
+
+union hv_ghcb;
+
+DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
void *data);
-#define hv_init_timer(timer, tick) \
- wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick)
-#define hv_init_timer_config(timer, val) \
- wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val)
-
-#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val)
-#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val)
-
-#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val)
-#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val)
-
-#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val)
-#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val)
-
-#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index)
-
-#define hv_signal_eom() wrmsrl(HV_X64_MSR_EOM, 0)
-
-#define hv_get_synint_state(int_num, val) \
- rdmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_set_synint_state(int_num, val) \
- wrmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_recommend_using_aeoi() \
- (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED))
-
-#define hv_get_crash_ctl(val) \
- rdmsrl(HV_X64_MSR_CRASH_CTL, val)
-
-#define hv_get_time_ref_count(val) \
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val)
-
-#define hv_get_reference_tsc(val) \
- rdmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_reference_tsc(val) \
- wrmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_clocksource_vdso(val) \
- ((val).vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK)
-#define hv_enable_vdso_clocksource() \
- vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
#define hv_get_raw_timer() rdtsc_ordered()
-#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR
-
-/*
- * Reference to pv_ops must be inline so objtool
- * detection of noinstr violations can work correctly.
- */
-static __always_inline void hv_setup_sched_clock(void *sched_clock)
-{
-#ifdef CONFIG_PARAVIRT
- pv_ops.time.sched_clock = sched_clock;
-#endif
-}
void hyperv_vector_handler(struct pt_regs *regs);
-static inline void hv_enable_stimer0_percpu_irq(int irq) {}
-static inline void hv_disable_stimer0_percpu_irq(int irq) {}
-
-
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
extern void *hv_hypercall_pg;
-extern void __percpu **hyperv_pcpu_input_arg;
-extern void __percpu **hyperv_pcpu_output_arg;
extern u64 hv_current_partition_id;
+extern union hv_ghcb * __percpu *hv_ghcb_pg;
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
@@ -189,38 +139,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return hv_status;
}
-/*
- * Rep hypercalls. Callers of this functions are supposed to ensure that
- * rep_count and varhead_size comply with Hyper-V hypercall definition.
- */
-static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
- void *input, void *output)
-{
- u64 control = code;
- u64 status;
- u16 rep_comp;
-
- control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
- control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
-
- do {
- status = hv_do_hypercall(control, input, output);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
- return status;
-
- /* Bits 32-43 of status have 'Reps completed' data. */
- rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >>
- HV_HYPERCALL_REP_COMP_OFFSET;
-
- control &= ~HV_HYPERCALL_REP_START_MASK;
- control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET;
-
- touch_nmi_watchdog();
- } while (rep_comp < rep_count);
-
- return status;
-}
-
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -233,9 +151,6 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
void __init hyperv_init(void);
void hyperv_setup_mmu_ops(void);
-void *hv_alloc_hyperv_page(void);
-void *hv_alloc_hyperv_zeroed_page(void);
-void hv_free_hyperv_page(unsigned long addr);
void set_hv_tscchange_cb(void (*cb)(void));
void clear_hv_tscchange_cb(void);
void hyperv_stop_tsc_emulation(void);
@@ -246,8 +161,6 @@ int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
-extern bool hv_root_partition;
-
#ifdef CONFIG_X86_64
void hv_apic_init(void);
void __init hv_init_spinlocks(void);
@@ -256,24 +169,63 @@ bool hv_vcpu_is_preempted(int vcpu);
static inline void hv_apic_init(void) {}
#endif
-static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
- struct msi_desc *msi_desc)
-{
- msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
- msi_entry->data.as_uint32 = msi_desc->msg.data;
-}
-
struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+void hv_ghcb_msr_write(u64 msr, u64 value);
+void hv_ghcb_msr_read(u64 msr, u64 *value);
+bool hv_ghcb_negotiate_protocol(void);
+void hv_ghcb_terminate(unsigned int set, unsigned int reason);
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
+static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
+#endif
+
+extern bool hv_isolation_type_snp(void);
+
+static inline bool hv_is_synic_reg(unsigned int reg)
+{
+ if ((reg >= HV_REGISTER_SCONTROL) &&
+ (reg <= HV_REGISTER_SINT15))
+ return true;
+ return false;
+}
+
+static inline u64 hv_get_register(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
+ hv_ghcb_msr_read(reg, &value);
+ else
+ rdmsrl(reg, value);
+ return value;
+}
+
+static inline void hv_set_register(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
+ hv_ghcb_msr_write(reg, value);
+
+ /* Write proxy bit via wrmsl instruction */
+ if (reg >= HV_REGISTER_SINT0 &&
+ reg <= HV_REGISTER_SINT15)
+ wrmsrl(reg, value | 1 << 20);
+ } else {
+ wrmsrl(reg, value);
+ }
+}
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
-static inline void *hv_alloc_hyperv_page(void) { return NULL; }
-static inline void hv_free_hyperv_page(unsigned long addr) {}
static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
static inline void clear_hv_tscchange_cb(void) {}
static inline void hyperv_stop_tsc_emulation(void) {};
@@ -287,6 +239,13 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
{
return -1;
}
+static inline void hv_set_register(unsigned int reg, u64 value) { }
+static inline u64 hv_get_register(unsigned int reg) { return 0; }
+static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
+ bool visible)
+{
+ return -1;
+}
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index b85147d75626..d71c7e8b738d 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -12,14 +12,17 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
/* Structs and defines for the X86 specific MSI message format */
typedef struct x86_msi_data {
- u32 vector : 8,
- delivery_mode : 3,
- dest_mode_logical : 1,
- reserved : 2,
- active_low : 1,
- is_level : 1;
-
- u32 dmar_subhandle;
+ union {
+ struct {
+ u32 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ reserved : 2,
+ active_low : 1,
+ is_level : 1;
+ };
+ u32 dmar_subhandle;
+ };
} __attribute__ ((packed)) arch_msi_msg_data_t;
#define arch_msi_msg_data x86_msi_data
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 546d6ecf0a35..d27e0581b777 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -76,6 +76,8 @@
/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
#define MSR_IA32_CORE_CAPS 0x000000cf
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT)
#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5
#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT)
@@ -114,6 +116,30 @@
* Not susceptible to
* TSX Async Abort (TAA) vulnerabilities.
*/
+#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /*
+ * Not susceptible to SBDR and SSDP
+ * variants of Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FBSDP_NO BIT(14) /*
+ * Not susceptible to FBSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_PSDP_NO BIT(15) /*
+ * Not susceptible to PSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FB_CLEAR BIT(17) /*
+ * VERW clears CPU fill buffer
+ * even on MDS_NO CPUs.
+ */
+#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /*
+ * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
+ * bit available to control VERW
+ * behavior.
+ */
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
@@ -128,9 +154,10 @@
#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
-/* SRBDS support */
#define MSR_IA32_MCU_OPT_CTRL 0x00000123
-#define RNGDS_MITG_DIS BIT(0)
+#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
+#define RTM_ALLOW BIT(1) /* TSX development mode */
+#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -154,6 +181,11 @@
#define MSR_IA32_POWER_CTL 0x000001fc
#define MSR_IA32_POWER_CTL_BIT_EE 19
+/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
+#define MSR_INTEGRITY_CAPS 0x000002d9
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+
#define MSR_LBR_NHM_FROM 0x00000680
#define MSR_LBR_NHM_TO 0x000006c0
#define MSR_LBR_CORE_FROM 0x00000040
@@ -185,6 +217,9 @@
#define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+#define PERF_CAP_METRICS_IDX 15
+#define PERF_CAP_PT_IDX 16
+
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define MSR_IA32_RTIT_CTL 0x00000570
@@ -202,6 +237,8 @@
#define RTIT_CTL_DISRETC BIT(11)
#define RTIT_CTL_PTW_EN BIT(12)
#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_EVENT_EN BIT(31)
+#define RTIT_CTL_NOTNT BIT_ULL(55)
#define RTIT_CTL_MTC_RANGE_OFFSET 14
#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
#define RTIT_CTL_CYC_THRESH_OFFSET 19
@@ -265,6 +302,7 @@
#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
#define DEBUGCTLMSR_BTF_SHIFT 1
#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2)
#define DEBUGCTLMSR_TR (1UL << 6)
#define DEBUGCTLMSR_BTS (1UL << 7)
#define DEBUGCTLMSR_BTINT (1UL << 8)
@@ -306,6 +344,7 @@
/* Run Time Average Power Limiting (RAPL) Interface */
+#define MSR_VR_CURRENT_CONFIG 0x00000601
#define MSR_RAPL_POWER_UNIT 0x00000606
#define MSR_PKG_POWER_LIMIT 0x00000610
@@ -356,11 +395,29 @@
#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c
#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d
-
#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x000006a0 /* user mode cet */
+#define MSR_IA32_S_CET 0x000006a2 /* kernel mode cet */
+#define CET_SHSTK_EN BIT_ULL(0)
+#define CET_WRSS_EN BIT_ULL(1)
+#define CET_ENDBR_EN BIT_ULL(2)
+#define CET_LEG_IW_EN BIT_ULL(3)
+#define CET_NO_TRACK_EN BIT_ULL(4)
+#define CET_SUPPRESS_DISABLE BIT_ULL(5)
+#define CET_RESERVED (BIT_ULL(6) | BIT_ULL(7) | BIT_ULL(8) | BIT_ULL(9))
+#define CET_SUPPRESS BIT_ULL(10)
+#define CET_WAIT_ENDBR BIT_ULL(11)
+
+#define MSR_IA32_PL0_SSP 0x000006a4 /* ring-0 shadow stack pointer */
+#define MSR_IA32_PL1_SSP 0x000006a5 /* ring-1 shadow stack pointer */
+#define MSR_IA32_PL2_SSP 0x000006a6 /* ring-2 shadow stack pointer */
+#define MSR_IA32_PL3_SSP 0x000006a7 /* ring-3 shadow stack pointer */
+#define MSR_IA32_INT_SSP_TAB 0x000006a8 /* exception shadow stack table */
+
/* Hardware P state interface */
#define MSR_PPERF 0x0000064e
#define MSR_PERF_LIMIT_REASONS 0x0000064f
@@ -472,16 +529,41 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
+#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+/* AMD Collaborative Processor Performance Control MSRs */
+#define MSR_AMD_CPPC_CAP1 0xc00102b0
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1
+#define MSR_AMD_CPPC_CAP2 0xc00102b2
+#define MSR_AMD_CPPC_REQ 0xc00102b3
+#define MSR_AMD_CPPC_STATUS 0xc00102b4
+
+#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
+#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
+#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
+#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
+
+#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
+#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
+#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
+#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
+
+/* AMD Performance Counter Global Status and Control MSRs */
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
+#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
@@ -533,9 +615,9 @@
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
-#define MSR_K8_SYSCFG 0xc0010010
-#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
-#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_AMD64_SYSCFG 0xc0010010
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT)
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
@@ -621,6 +703,8 @@
#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+#define MSR_IA32_XFD 0x000001c4
+#define MSR_IA32_XFD_ERR 0x000001c5
#define MSR_IA32_XSS 0x00000da0
#define MSR_IA32_APICBASE 0x0000001b
@@ -628,8 +712,6 @@
#define MSR_IA32_APICBASE_ENABLE (1<<11)
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
-#define MSR_IA32_TSCDEADLINE 0x000006e0
-
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
@@ -646,6 +728,10 @@
#define MSR_IA32_PERF_CTL 0x00000199
#define INTEL_PERF_CTL_MASK 0xffff
+/* AMD Branch Sampling configuration */
+#define MSR_AMD_DBG_EXTN_CFG 0xc000010f
+#define MSR_AMD_SAMP_BR_FROM 0xc0010300
+
#define MSR_IA32_MPERF 0x000000e7
#define MSR_IA32_APERF 0x000000e8
@@ -683,12 +769,14 @@
#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+#define PACKAGE_THERM_STATUS_HFI_UPDATED (1 << 26)
#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+#define PACKAGE_THERM_INT_HFI_ENABLE (1 << 25)
/* Thermal Thresholds Support */
#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
@@ -770,6 +858,10 @@
#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+#define MSR_TFA_TSX_CPUID_CLEAR_BIT 1
+#define MSR_TFA_TSX_CPUID_CLEAR BIT_ULL(MSR_TFA_TSX_CPUID_CLEAR_BIT)
+#define MSR_TFA_SDV_ENABLE_RTM_BIT 2
+#define MSR_TFA_SDV_ENABLE_RTM BIT_ULL(MSR_TFA_SDV_ENABLE_RTM_BIT)
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
@@ -933,4 +1025,8 @@
#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
+/* Hardware Feedback Interface */
+#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0
+#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e16cccdd0420..65ec1965cd28 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -10,16 +10,7 @@
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
-
-struct msr {
- union {
- struct {
- u32 l;
- u32 h;
- };
- u64 q;
- };
-};
+#include <asm/shared/msr.h>
struct msr_info {
u32 msr_no;
@@ -92,7 +83,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr)
asm volatile("1: rdmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
@@ -102,7 +93,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
@@ -137,17 +128,11 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
{
DECLARE_ARGS(val, low, high);
- asm volatile("2: rdmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err]\n\t"
- "xorl %%eax, %%eax\n\t"
- "xorl %%edx, %%edx\n\t"
- "jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: rdmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), [fault] "i" (-EIO));
+ : "c" (msr));
if (tracepoint_enabled(read_msr))
do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
@@ -169,15 +154,11 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
int err;
- asm volatile("2: wrmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: wrmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
: [err] "=a" (err)
- : "c" (msr), "0" (low), "d" (high),
- [fault] "i" (-EIO)
+ : "c" (msr), "0" (low), "d" (high)
: "memory");
if (tracepoint_enabled(write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), err);
@@ -324,10 +305,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
}
-#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
-
-#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
-
struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
int msr_set_bit(u32 msr, u8 bit);
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 829df26fd7a3..76d726074c16 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -24,8 +24,8 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
-#include <asm/memtype.h>
+void mtrr_bp_init(void);
/*
* The following functions are for use by other drivers that cannot use
@@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
extern void set_mtrr_aps_delayed_init(void);
extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
@@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-static inline void mtrr_bp_init(void)
-{
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-}
-
#define mtrr_ap_init() do {} while (0)
#define set_mtrr_aps_delayed_init() do {} while (0)
#define mtrr_aps_init() do {} while (0)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 1cb9c17a4cb4..5c5f1e56c404 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -47,6 +47,7 @@ struct nmiaction {
#define register_nmi_handler(t, fn, fg, n, init...) \
({ \
static struct nmiaction init fn##_na = { \
+ .list = LIST_HEAD_INIT(fn##_na.list), \
.handler = (fn), \
.name = (n), \
.flags = (fg), \
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 12f12b5cf2ca..c5573eaa5bb9 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -2,146 +2,76 @@
#ifndef _ASM_X86_NOPS_H
#define _ASM_X86_NOPS_H
+#include <asm/asm.h>
+
/*
* Define nops for use with alternative() and for tracing.
- *
- * *_NOP5_ATOMIC must be a single instruction.
*/
-#define NOP_DS_PREFIX 0x3e
+#ifndef CONFIG_64BIT
-/* generic versions from gas
- 1: nop
- the following instructions are NOT nops in 64-bit mode,
- for 64-bit mode use K8 or P6 nops instead
- 2: movl %esi,%esi
- 3: leal 0x00(%esi),%esi
- 4: leal 0x00(,%esi,1),%esi
- 6: leal 0x00000000(%esi),%esi
- 7: leal 0x00000000(,%esi,1),%esi
-*/
-#define GENERIC_NOP1 0x90
-#define GENERIC_NOP2 0x89,0xf6
-#define GENERIC_NOP3 0x8d,0x76,0x00
-#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
-#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
-#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
-#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
-#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
-#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
-
-/* Opteron 64bit nops
- 1: nop
- 2: osp nop
- 3: osp osp nop
- 4: osp osp osp nop
-*/
-#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2 0x66,K8_NOP1
-#define K8_NOP3 0x66,K8_NOP2
-#define K8_NOP4 0x66,K8_NOP3
-#define K8_NOP5 K8_NOP3,K8_NOP2
-#define K8_NOP6 K8_NOP3,K8_NOP3
-#define K8_NOP7 K8_NOP4,K8_NOP3
-#define K8_NOP8 K8_NOP4,K8_NOP4
-#define K8_NOP5_ATOMIC 0x66,K8_NOP4
+/*
+ * Generic 32bit nops from GAS:
+ *
+ * 1: nop
+ * 2: movl %esi,%esi
+ * 3: leal 0x0(%esi),%esi
+ * 4: leal 0x0(%esi,%eiz,1),%esi
+ * 5: leal %ds:0x0(%esi,%eiz,1),%esi
+ * 6: leal 0x0(%esi),%esi
+ * 7: leal 0x0(%esi,%eiz,1),%esi
+ * 8: leal %ds:0x0(%esi,%eiz,1),%esi
+ *
+ * Except 5 and 8, which are DS prefixed 4 and 7 resp, where GAS would emit 2
+ * nop instructions.
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x89,0xf6
+#define BYTES_NOP3 0x8d,0x76,0x00
+#define BYTES_NOP4 0x8d,0x74,0x26,0x00
+#define BYTES_NOP5 0x3e,BYTES_NOP4
+#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x3e,BYTES_NOP7
-/* K7 nops
- uses eax dependencies (arbitrary choice)
- 1: nop
- 2: movl %eax,%eax
- 3: leal (,%eax,1),%eax
- 4: leal 0x00(,%eax,1),%eax
- 6: leal 0x00000000(%eax),%eax
- 7: leal 0x00000000(,%eax,1),%eax
-*/
-#define K7_NOP1 GENERIC_NOP1
-#define K7_NOP2 0x8b,0xc0
-#define K7_NOP3 0x8d,0x04,0x20
-#define K7_NOP4 0x8d,0x44,0x20,0x00
-#define K7_NOP5 K7_NOP4,K7_NOP1
-#define K7_NOP6 0x8d,0x80,0,0,0,0
-#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
-#define K7_NOP8 K7_NOP7,K7_NOP1
-#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
+#else
-/* P6 nops
- uses eax dependencies (Intel-recommended choice)
- 1: nop
- 2: osp nop
- 3: nopl (%eax)
- 4: nopl 0x00(%eax)
- 5: nopl 0x00(%eax,%eax,1)
- 6: osp nopl 0x00(%eax,%eax,1)
- 7: nopl 0x00000000(%eax)
- 8: nopl 0x00000000(%eax,%eax,1)
- Note: All the above are assumed to be a single instruction.
- There is kernel code that depends on this.
-*/
-#define P6_NOP1 GENERIC_NOP1
-#define P6_NOP2 0x66,0x90
-#define P6_NOP3 0x0f,0x1f,0x00
-#define P6_NOP4 0x0f,0x1f,0x40,0
-#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
-#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
-#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
-#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
-#define P6_NOP5_ATOMIC P6_NOP5
+/*
+ * Generic 64bit nops from GAS:
+ *
+ * 1: nop
+ * 2: osp nop
+ * 3: nopl (%eax)
+ * 4: nopl 0x00(%eax)
+ * 5: nopl 0x00(%eax,%eax,1)
+ * 6: osp nopl 0x00(%eax,%eax,1)
+ * 7: nopl 0x00000000(%eax)
+ * 8: nopl 0x00000000(%eax,%eax,1)
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x66,BYTES_NOP1
+#define BYTES_NOP3 0x0f,0x1f,0x00
+#define BYTES_NOP4 0x0f,0x1f,0x40,0x00
+#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00
+#define BYTES_NOP6 0x66,BYTES_NOP5
+#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-#ifdef __ASSEMBLY__
-#define _ASM_MK_NOP(x) .byte x
-#else
-#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
-#endif
+#endif /* CONFIG_64BIT */
-#if defined(CONFIG_MK7)
-#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
-#elif defined(CONFIG_X86_P6_NOP)
-#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
-#elif defined(CONFIG_X86_64)
-#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
-#else
-#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
-#endif
+#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1)
+#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2)
+#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3)
+#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4)
+#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5)
+#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6)
+#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
+#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)
#define ASM_NOP_MAX 8
-#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
#ifndef __ASSEMBLY__
-extern const unsigned char * const *ideal_nops;
-extern void arch_init_ideal_nops(void);
+extern const unsigned char * const x86_nops[];
#endif
#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index cb9ad6b73973..da251a5645b0 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -5,13 +5,15 @@
#include <linux/static_key.h>
#include <linux/objtool.h>
+#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
+#define RETPOLINE_THUNK_SIZE 32
+
/*
* Fill the CPU return stack buffer.
*
@@ -33,7 +35,7 @@
/*
* Google experimented with loop-unrolling and this turned out to be
- * the optimal version — two calls, each with their own speculation
+ * the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
#define __FILL_RETURN_BUFFER(reg, nr, sp) \
@@ -81,8 +83,8 @@
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
- __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
jmp *%\reg
#endif
@@ -91,8 +93,8 @@
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
- __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
#else
call *%\reg
#endif
@@ -119,6 +121,16 @@
".popsection\n\t"
#ifdef CONFIG_RETPOLINE
+
+typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+extern retpoline_thunk_t __x86_indirect_thunk_array[];
+
#ifdef CONFIG_X86_64
/*
@@ -129,12 +141,12 @@
ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- "call __x86_retpoline_%V[thunk_target]\n", \
+ "call __x86_indirect_thunk_%V[thunk_target]\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_AMD)
+ X86_FEATURE_RETPOLINE_LFENCE)
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
@@ -164,7 +176,7 @@
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_AMD)
+ X86_FEATURE_RETPOLINE_LFENCE)
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
@@ -176,9 +188,11 @@
/* The Spectre V2 mitigation variants */
enum spectre_v2_mitigation {
SPECTRE_V2_NONE,
- SPECTRE_V2_RETPOLINE_GENERIC,
- SPECTRE_V2_RETPOLINE_AMD,
- SPECTRE_V2_IBRS_ENHANCED,
+ SPECTRE_V2_RETPOLINE,
+ SPECTRE_V2_LFENCE,
+ SPECTRE_V2_EIBRS,
+ SPECTRE_V2_EIBRS_RETPOLINE,
+ SPECTRE_V2_EIBRS_LFENCE,
};
/* The indirect branch speculation control variants */
@@ -253,6 +267,10 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DECLARE_STATIC_KEY_FALSE(mds_user_clear);
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+
#include <asm/segment.h>
/**
@@ -302,63 +320,4 @@ static inline void mds_idle_clear_cpu_buffers(void)
#endif /* __ASSEMBLY__ */
-/*
- * Below is used in the eBPF JIT compiler and emits the byte sequence
- * for the following assembly:
- *
- * With retpolines configured:
- *
- * callq do_rop
- * spec_trap:
- * pause
- * lfence
- * jmp spec_trap
- * do_rop:
- * mov %rcx,(%rsp) for x86_64
- * mov %edx,(%esp) for x86_32
- * retq
- *
- * Without retpolines configured:
- *
- * jmp *%rcx for x86_64
- * jmp *%edx for x86_32
- */
-#ifdef CONFIG_RETPOLINE
-# ifdef CONFIG_X86_64
-# define RETPOLINE_RCX_BPF_JIT_SIZE 17
-# define RETPOLINE_RCX_BPF_JIT() \
-do { \
- EMIT1_off32(0xE8, 7); /* callq do_rop */ \
- /* spec_trap: */ \
- EMIT2(0xF3, 0x90); /* pause */ \
- EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
- EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
- /* do_rop: */ \
- EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \
- EMIT1(0xC3); /* retq */ \
-} while (0)
-# else /* !CONFIG_X86_64 */
-# define RETPOLINE_EDX_BPF_JIT() \
-do { \
- EMIT1_off32(0xE8, 7); /* call do_rop */ \
- /* spec_trap: */ \
- EMIT2(0xF3, 0x90); /* pause */ \
- EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
- EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
- /* do_rop: */ \
- EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
- EMIT1(0xC3); /* ret */ \
-} while (0)
-# endif
-#else /* !CONFIG_RETPOLINE */
-# ifdef CONFIG_X86_64
-# define RETPOLINE_RCX_BPF_JIT_SIZE 2
-# define RETPOLINE_RCX_BPF_JIT() \
- EMIT2(0xFF, 0xE1); /* jmp *%rcx */
-# else /* !CONFIG_X86_64 */
-# define RETPOLINE_EDX_BPF_JIT() \
- EMIT2(0xFF, 0xE2) /* jmp *%edx */
-# endif
-#endif
-
#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 7555b48803a8..9cc82f305f4b 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
@@ -71,6 +71,16 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+ return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+ return __canonical_address(vaddr, vaddr_bits) == vaddr;
+}
+
#endif /* __ASSEMBLY__ */
#include <asm-generic/memory_model.h>
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 94dbd51df58f..df42f8aa99e4 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -19,19 +19,6 @@ extern unsigned long __phys_addr(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* CONFIG_FLATMEM */
-#ifdef CONFIG_X86_USE_3DNOW
-#include <asm/mmx.h>
-
-static inline void clear_page(void *page)
-{
- mmx_clear_page(page);
-}
-
-static inline void copy_page(void *to, void *from)
-{
- mmx_copy_page(to, from);
-}
-#else /* !CONFIG_X86_USE_3DNOW */
#include <linux/string.h>
static inline void clear_page(void *page)
@@ -43,7 +30,6 @@ static inline void copy_page(void *to, void *from)
{
memcpy(to, from, PAGE_SIZE);
}
-#endif /* CONFIG_X86_3DNOW */
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..baa70451b8df 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -5,6 +5,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
@@ -15,7 +16,7 @@ extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
-static inline unsigned long __phys_addr_nodebug(unsigned long x)
+static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
@@ -56,6 +57,39 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
+#ifdef CONFIG_X86_5LEVEL
+/*
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+static __always_inline unsigned long task_size_max(void)
+{
+ unsigned long ret;
+
+ alternative_io("movq %[small],%0","movq %[large],%0",
+ X86_FEATURE_LA57,
+ "=r" (ret),
+ [small] "i" ((1ul << 47)-PAGE_SIZE),
+ [large] "i" ((1ul << 56)-PAGE_SIZE));
+
+ return ret;
+}
+#endif /* CONFIG_X86_5LEVEL */
+
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 64297eabad63..e9e2c3ba5923 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -15,7 +15,7 @@
#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
@@ -55,30 +55,13 @@
#ifdef CONFIG_X86_5LEVEL
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
+/* See task_size_max() in <asm/page_64.h> */
#else
#define __VIRTUAL_MASK_SHIFT 47
+#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
#endif
-/*
- * User space process size. This is the first address outside the user range.
- * There are a few constraints that determine this:
- *
- * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- * address, then that syscall will enter the kernel with a
- * non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything
- * from being mapped at the maximum canonical address.
- *
- * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- * CPUs malfunction if they execute code from the highest canonical page.
- * They'll speculate right off the end of the canonical space, and
- * bad things happen. This is worked around in the same way as the
- * Intel problem.
- *
- * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
-#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-
+#define TASK_SIZE_MAX task_size_max()
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4abf110e2243..964442b99245 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,11 +15,20 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
+#include <linux/static_call_types.h>
#include <asm/frame.h>
-static inline unsigned long long paravirt_sched_clock(void)
+u64 dummy_steal_clock(int cpu);
+u64 dummy_sched_clock(void);
+
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
+
+void paravirt_set_sched_clock(u64 (*func)(void));
+
+static inline u64 paravirt_sched_clock(void)
{
- return PVOP_CALL0(unsigned long long, time.sched_clock);
+ return static_call(pv_sched_clock)();
}
struct static_key;
@@ -33,24 +42,28 @@ bool pv_is_native_vcpu_is_preempted(void);
static inline u64 paravirt_steal_clock(int cpu)
{
- return PVOP_CALL1(u64, time.steal_clock, cpu);
+ return static_call(pv_steal_clock)(cpu);
}
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init paravirt_set_cap(void);
+#endif
+
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
- pv_ops.cpu.io_delay();
+ PVOP_VCALL0(cpu.io_delay);
#ifdef REALLY_SLOW_IO
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
#endif
}
void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline void __flush_tlb_local(void)
@@ -68,10 +81,10 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}
-static inline void __flush_tlb_others(const struct cpumask *cpumask,
+static inline void __flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+ PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
@@ -84,6 +97,12 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
PVOP_VCALL1(mmu.exit_mmap, mm);
}
+static inline void notify_page_enc_status_changed(unsigned long pfn,
+ int npages, bool enc)
+{
+ PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
@@ -100,12 +119,12 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
/*
* These special macros can be used to get or set a debugging register
*/
-static inline unsigned long paravirt_get_debugreg(int reg)
+static __always_inline unsigned long paravirt_get_debugreg(int reg)
{
return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
-static inline void set_debugreg(unsigned long val, int reg)
+static __always_inline void set_debugreg(unsigned long val, int reg)
{
PVOP_VCALL2(cpu.set_debugreg, reg, val);
}
@@ -120,24 +139,28 @@ static inline void write_cr0(unsigned long x)
PVOP_VCALL1(cpu.write_cr0, x);
}
-static inline unsigned long read_cr2(void)
+static __always_inline unsigned long read_cr2(void)
{
- return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
+ return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
+ "mov %%cr2, %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
-static inline void write_cr2(unsigned long x)
+static __always_inline void write_cr2(unsigned long x)
{
PVOP_VCALL1(mmu.write_cr2, x);
}
static inline unsigned long __read_cr3(void)
{
- return PVOP_CALL0(unsigned long, mmu.read_cr3);
+ return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
+ "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(mmu.write_cr3, x);
+ PVOP_ALT_VCALL1(mmu.write_cr3, x,
+ "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __write_cr4(unsigned long x)
@@ -157,7 +180,7 @@ static inline void halt(void)
static inline void wbinvd(void)
{
- PVOP_VCALL0(cpu.wbinvd);
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
}
static inline u64 paravirt_read_msr(unsigned msr)
@@ -371,22 +394,28 @@ static inline void paravirt_release_p4d(unsigned long pfn)
static inline pte_t __pte(pteval_t val)
{
- return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) };
+ return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pteval_t pte_val(pte_t pte)
{
- return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte);
+ return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline pgd_t __pgd(pgdval_t val)
{
- return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) };
+ return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
- return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd);
+ return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -419,12 +448,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline pmd_t __pmd(pmdval_t val)
{
- return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) };
+ return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
- return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd);
+ return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -436,14 +468,16 @@ static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
+ ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
+ return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void pud_clear(pud_t *pudp)
@@ -462,14 +496,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
static inline p4d_t __p4d(p4dval_t val)
{
- p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val);
+ p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV));
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
- return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
+ return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@ -556,7 +593,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
- PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
+ PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
+ "movb $0, (%%" _ASM_ARG1 ");",
+ ALT_NOT(X86_FEATURE_PVUNLOCK));
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
@@ -571,7 +610,9 @@ static __always_inline void pv_kick(int cpu)
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
- return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
+ return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu,
+ "xor %%" _ASM_AX ", %%" _ASM_AX ";",
+ ALT_NOT(X86_FEATURE_VCPUPREEMPT));
}
void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
@@ -618,22 +659,26 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
* functions.
*/
#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
-#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \
extern typeof(func) __raw_callee_save_##func; \
\
- asm(".pushsection .text;" \
+ asm(".pushsection " section ", \"ax\";" \
".globl " PV_THUNK_NAME(func) ";" \
".type " PV_THUNK_NAME(func) ", @function;" \
PV_THUNK_NAME(func) ":" \
+ ASM_ENDBR \
FRAME_BEGIN \
PV_SAVE_ALL_CALLER_REGS \
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
FRAME_END \
- "ret;" \
+ ASM_RET \
".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \
".popsection")
+#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
+
/* Get a reference to a callee-save function */
#define PV_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { __raw_callee_save_##func })
@@ -643,22 +688,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
((struct paravirt_callee_save) { func })
#ifdef CONFIG_PARAVIRT_XXL
-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
{
- return PVOP_CALLEE0(unsigned long, irq.save_fl);
+ return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace void arch_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
- PVOP_VCALLEE0(irq.irq_disable);
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace void arch_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
- PVOP_VCALLEE0(irq.irq_enable);
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
unsigned long f;
@@ -700,84 +746,22 @@ extern void default_banner(void);
.popsection
-#define COND_PUSH(set, mask, reg) \
- .if ((~(set)) & mask); push %reg; .endif
-#define COND_POP(set, mask, reg) \
- .if ((~(set)) & mask); pop %reg; .endif
-
#ifdef CONFIG_X86_64
-
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_RAX, rax); \
- COND_PUSH(set, CLBR_RCX, rcx); \
- COND_PUSH(set, CLBR_RDX, rdx); \
- COND_PUSH(set, CLBR_RSI, rsi); \
- COND_PUSH(set, CLBR_RDI, rdi); \
- COND_PUSH(set, CLBR_R8, r8); \
- COND_PUSH(set, CLBR_R9, r9); \
- COND_PUSH(set, CLBR_R10, r10); \
- COND_PUSH(set, CLBR_R11, r11)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_R11, r11); \
- COND_POP(set, CLBR_R10, r10); \
- COND_POP(set, CLBR_R9, r9); \
- COND_POP(set, CLBR_R8, r8); \
- COND_POP(set, CLBR_RDI, rdi); \
- COND_POP(set, CLBR_RSI, rsi); \
- COND_POP(set, CLBR_RDX, rdx); \
- COND_POP(set, CLBR_RCX, rcx); \
- COND_POP(set, CLBR_RAX, rax)
+#ifdef CONFIG_PARAVIRT_XXL
#define PARA_PATCH(off) ((off) / 8)
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#else
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_EAX, eax); \
- COND_PUSH(set, CLBR_EDI, edi); \
- COND_PUSH(set, CLBR_ECX, ecx); \
- COND_PUSH(set, CLBR_EDX, edx)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_EDX, edx); \
- COND_POP(set, CLBR_ECX, ecx); \
- COND_POP(set, CLBR_EDI, edi); \
- COND_POP(set, CLBR_EAX, eax)
-
-#define PARA_PATCH(off) ((off) / 4)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4)
-#define PARA_INDIRECT(addr) *%cs:addr
-#endif
-
-#ifdef CONFIG_PARAVIRT_XXL
-#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(PV_CPU_iret), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)
-
-#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-
-#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT_XXL
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+.macro PARA_IRQ_save_fl
+ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+.endm
+
+#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
+ ALT_NOT(X86_FEATURE_XENPV)
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
@@ -800,5 +784,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif
+
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void paravirt_set_cap(void)
+{
+}
+#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_api_clock.h b/arch/x86/include/asm/paravirt_api_clock.h
new file mode 100644
index 000000000000..65ac7cee0dad
--- /dev/null
+++ b/arch/x86/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index de87087d3bde..89df6c6617f5 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -3,7 +3,6 @@
#define _ASM_X86_PARAVIRT_TYPES_H
/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_NONE 0
#define CLBR_EAX (1 << 0)
#define CLBR_ECX (1 << 1)
#define CLBR_EDX (1 << 2)
@@ -15,7 +14,6 @@
#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#define CLBR_SCRATCH (0)
#else
#define CLBR_RAX CLBR_EAX
#define CLBR_RCX CLBR_ECX
@@ -32,12 +30,9 @@
#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
CLBR_RCX | CLBR_R8 | CLBR_R9)
#define CLBR_RET_REG (CLBR_RAX)
-#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
#endif /* X86_64 */
-#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
-
#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
@@ -73,19 +68,6 @@ struct pv_info {
const char *name;
};
-struct pv_init_ops {
- /*
- * Patch may replace one of the defined code sequences with
- * arbitrary code, subject to the same register constraints.
- * This generally means the code is not free to clobber any
- * registers other than EAX. The patch function should return
- * the number of bytes of code generated, as we nop pad the
- * rest in generic code.
- */
- unsigned (*patch)(u8 type, void *insn_buff,
- unsigned long addr, unsigned len);
-} __no_randomize_layout;
-
#ifdef CONFIG_PARAVIRT_XXL
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
@@ -95,11 +77,6 @@ struct pv_lazy_ops {
} __no_randomize_layout;
#endif
-struct pv_time_ops {
- unsigned long long (*sched_clock)(void);
- unsigned long long (*steal_clock)(int cpu);
-} __no_randomize_layout;
-
struct pv_cpu_ops {
/* hooks for various privileged instructions */
void (*io_delay)(void);
@@ -156,10 +133,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /* Normal iret. Jump to this with the standard iret stack
- frame set up. */
- void (*iret)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -188,13 +161,14 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_one_user)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
- const struct flush_tlb_info *info);
+ void (*flush_tlb_multi)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
+ void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
#ifdef CONFIG_PARAVIRT_XXL
struct paravirt_callee_save read_cr2;
@@ -290,8 +264,6 @@ struct pv_lock_ops {
* number for each function using the offset which we use to indicate
* what to patch. */
struct paravirt_patch_template {
- struct pv_init_ops init;
- struct pv_time_ops time;
struct pv_cpu_ops cpu;
struct pv_irq_ops irq;
struct pv_mmu_ops mmu;
@@ -306,7 +278,7 @@ extern struct paravirt_patch_template pv_ops;
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "i" (&(pv_ops.op))
+ [paravirt_opptr] "m" (pv_ops.op)
#define paravirt_clobber(clobber) \
[paravirt_clobber] "i" (clobber)
@@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops;
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len);
-unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len);
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end);
-
-unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len);
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
int paravirt_disable_iospace(void);
@@ -347,7 +315,7 @@ int paravirt_disable_iospace(void);
*/
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
- "call *%c[paravirt_opptr];"
+ "call *%[paravirt_opptr];"
/*
* These macros are intended to wrap calls through one of the paravirt
@@ -371,7 +339,7 @@ int paravirt_disable_iospace(void);
* on the stack. All caller-save registers (eax,edx,ecx) are expected
* to be modified (either clobbered or used for return values).
* X86_64, on the other hand, already specifies a register-based calling
- * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
+ * conventions, returning at %rax, with parameters going on %rdi, %rsi,
* %rdx, and %rcx. Note that for this reason, x86_64 does not need any
* special handling for dealing with 4 arguments, unlike i386.
* However, x86_64 also have to clobber all caller saved registers, which
@@ -414,11 +382,9 @@ int paravirt_disable_iospace(void);
* makes sure the incoming and outgoing types are always correct.
*/
#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
@@ -434,12 +400,10 @@ int paravirt_disable_iospace(void);
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
/* [re]ax isn't an arg, but the return val */
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
__edx = __edx, __ecx = __ecx, __eax = __eax;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
@@ -464,152 +428,138 @@ int paravirt_disable_iospace(void);
#define PVOP_TEST_NULL(op) ((void)pv_ops.op)
#endif
-#define PVOP_RETMASK(rettype) \
+#define PVOP_RETVAL(rettype) \
({ unsigned long __mask = ~0UL; \
+ BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \
switch (sizeof(rettype)) { \
case 1: __mask = 0xffUL; break; \
case 2: __mask = 0xffffUL; break; \
case 4: __mask = 0xffffffffUL; break; \
default: break; \
} \
- __mask; \
+ __mask & __eax; \
})
-#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
- pre, post, ...) \
+#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \
({ \
- rettype __ret; \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- /* This is 32-bit specific, but is okay in 64-bit */ \
- /* since this condition will never hold */ \
- if (sizeof(rettype) > sizeof(unsigned long)) { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)((((u64)__edx) << 32) | __eax); \
- } else { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \
- } \
- __ret; \
+ asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_CALL(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
- EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
-
-#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
- PVOP_CALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
-
-
-#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \
+ extra_clbr, ...) \
({ \
- PVOP_VCALL_ARGS; \
+ PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
+ asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_VCALL(op, pre, post, ...) \
- ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
- VEXTRA_CLOBBERS, \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_CALL(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
+ CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+
+#define __PVOP_VCALL(op, ...) \
+ (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \
+ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
-#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
- ____PVOP_VCALL(op.func, CLBR_RET_REG, \
- PVOP_VCALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_VCALLEESAVE(op, ...) \
+ (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
+#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
#define PVOP_CALL0(rettype, op) \
- __PVOP_CALL(rettype, op, "", "")
+ __PVOP_CALL(rettype, op)
#define PVOP_VCALL0(op) \
- __PVOP_VCALL(op, "", "")
+ __PVOP_VCALL(op)
+#define PVOP_ALT_CALL0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALL(rettype, op, alt, cond)
+#define PVOP_ALT_VCALL0(op, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond)
#define PVOP_CALLEE0(rettype, op) \
- __PVOP_CALLEESAVE(rettype, op, "", "")
+ __PVOP_CALLEESAVE(rettype, op)
#define PVOP_VCALLEE0(op) \
- __PVOP_VCALLEESAVE(op, "", "")
+ __PVOP_VCALLEESAVE(op)
+#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond)
+#define PVOP_ALT_VCALLEE0(op, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond)
#define PVOP_CALL1(rettype, op, arg1) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALL1(op, arg1) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALLEE1(rettype, op, arg1) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALLEE1(op, arg1) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALL2(rettype, op, arg1, arg2) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_VCALL2(op, arg1, arg2) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
-#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-#define PVOP_VCALLEE2(op, arg1, arg2) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
#define PVOP_VCALL3(op, arg1, arg2, arg3) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
-/* This is the only difference in x86_64. We can make it much simpler */
-#ifdef CONFIG_X86_32
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
__PVOP_CALL(rettype, op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
-#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
-#else
-#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, "", "", \
PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, "", "", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-#endif
/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
@@ -627,7 +577,9 @@ void paravirt_leave_lazy_mmu(void);
void paravirt_flush_lazy_mmu(void);
void _paravirt_nop(void);
+void paravirt_BUG(void);
u64 _paravirt_ident_64(u64);
+unsigned long paravirt_ret0(void);
#define paravirt_nop ((void *)_paravirt_nop)
diff --git a/arch/x86/include/asm/pc-conf-reg.h b/arch/x86/include/asm/pc-conf-reg.h
new file mode 100644
index 000000000000..56bceceacf5f
--- /dev/null
+++ b/arch/x86/include/asm/pc-conf-reg.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Support for the configuration register space at port I/O locations
+ * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec,
+ * Cyrix CPUs, numerous chipsets.
+ */
+#ifndef _ASM_X86_PC_CONF_REG_H
+#define _ASM_X86_PC_CONF_REG_H
+
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define PC_CONF_INDEX 0x22
+#define PC_CONF_DATA 0x23
+
+#define PC_CONF_MPS_IMCR 0x70
+
+extern raw_spinlock_t pc_conf_lock;
+
+static inline u8 pc_conf_get(u8 reg)
+{
+ outb(reg, PC_CONF_INDEX);
+ return inb(PC_CONF_DATA);
+}
+
+static inline void pc_conf_set(u8 reg, u8 data)
+{
+ outb(reg, PC_CONF_INDEX);
+ outb(data, PC_CONF_DATA);
+}
+
+#endif /* _ASM_X86_PC_CONF_REG_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d2c76c8d8cfd..f3fd5928bcbb 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -135,7 +135,7 @@ struct pci_setup_rom {
unsigned long bus;
unsigned long device;
unsigned long function;
- uint8_t romdata[0];
+ uint8_t romdata[];
};
#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 490411dba438..70533fdcbf02 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -5,7 +5,10 @@
* (c) 1999 Martin Mares <mj@ucw.cz>
*/
+#include <linux/errno.h>
+#include <linux/init.h>
#include <linux/ioport.h>
+#include <linux/spinlock.h>
#undef DEBUG
@@ -39,6 +42,8 @@ do { \
#define PCI_ROOT_NO_CRS 0x100000
#define PCI_NOASSIGN_BARS 0x200000
#define PCI_BIG_ROOT_WINDOW 0x400000
+#define PCI_USE_E820 0x800000
+#define PCI_NO_E820 0x1000000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
@@ -64,6 +69,8 @@ void pcibios_scan_specific_bus(int busn);
/* pci-irq.c */
+struct pci_dev;
+
struct irq_info {
u8 bus, devfn; /* Bus, device and function */
struct {
@@ -87,7 +94,16 @@ struct irq_routing_table {
u32 miniport_data; /* Crap */
u8 rfu[11];
u8 checksum; /* Modulo 256 checksum must give 0 */
- struct irq_info slots[0];
+ struct irq_info slots[];
+} __attribute__((packed));
+
+struct irt_routing_table {
+ u32 signature; /* IRT_SIGNATURE should be here */
+ u8 size; /* Number of entries provided */
+ u8 used; /* Number of entries actually used */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ struct irq_info slots[];
} __attribute__((packed));
extern unsigned int pcibios_irq_mask;
@@ -232,3 +248,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
# define x86_default_pci_init_irq NULL
# define x86_default_pci_fixup_irqs NULL
#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_ACPI)
+extern bool pci_use_e820;
+#else
+#define pci_use_e820 false
+#endif
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a3c33b79fb86..13c0d63ed55e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -38,9 +38,9 @@
#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- asm volatile("add " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (this_cpu_off), "0" (ptr)); \
+ asm ("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
(typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
})
#else
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 544f41a179fb..409725e86f42 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -2,12 +2,14 @@
#ifndef _ASM_X86_PERF_EVENT_H
#define _ASM_X86_PERF_EVENT_H
+#include <linux/static_call.h>
+
/*
* Performance event hw details:
*/
#define INTEL_PMC_MAX_GENERIC 32
-#define INTEL_PMC_MAX_FIXED 4
+#define INTEL_PMC_MAX_FIXED 16
#define INTEL_PMC_IDX_FIXED 32
#define X86_PMC_IDX_MAX 64
@@ -184,6 +186,18 @@ union cpuid28_ecx {
unsigned int full;
};
+/*
+ * AMD "Extended Performance Monitoring and Debug" CPUID
+ * detection/enumeration details:
+ */
+union cpuid_0x80000022_ebx {
+ struct {
+ /* Number of Core Performance Counters */
+ unsigned int num_core_pmc:4;
+ } split;
+ unsigned int full;
+};
+
struct x86_pmu_capability {
int version;
int num_counters_gp;
@@ -241,6 +255,11 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3)
#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+static inline bool use_fixed_pseudo_encoding(u64 code)
+{
+ return !(code & 0xff);
+}
+
/*
* We model BTS tracing as another fixed-mode PMC.
*
@@ -366,6 +385,11 @@ struct pebs_xmm {
};
/*
+ * AMD Extended Performance Monitoring and Debug cpuid feature detection
+ */
+#define EXT_PERFMON_DEBUG_FEATURES 0x80000022
+
+/*
* IBS cpuid feature detection
*/
@@ -386,6 +410,7 @@ struct pebs_xmm {
#define IBS_CAPS_OPBRNFUSE (1U<<8)
#define IBS_CAPS_FETCHCTLEXTD (1U<<9)
#define IBS_CAPS_OPDATA4 (1U<<10)
+#define IBS_CAPS_ZEN4 (1U<<11)
#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
| IBS_CAPS_FETCHSAM \
@@ -399,6 +424,7 @@ struct pebs_xmm {
#define IBSCTL_LVT_OFFSET_MASK 0x0F
/* IBS fetch bits/masks */
+#define IBS_FETCH_L3MISSONLY (1ULL<<59)
#define IBS_FETCH_RAND_EN (1ULL<<57)
#define IBS_FETCH_VAL (1ULL<<49)
#define IBS_FETCH_ENABLE (1ULL<<48)
@@ -415,6 +441,7 @@ struct pebs_xmm {
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_L3MISSONLY (1ULL<<16)
#define IBS_OP_MAX_CNT 0x0000FFFFULL
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */
@@ -478,6 +505,7 @@ struct x86_pmu_lbr {
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
+extern void perf_clear_dirty_counters(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
@@ -512,6 +540,27 @@ static inline void intel_pt_handle_vmx(int on)
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
extern void amd_pmu_enable_virt(void);
extern void amd_pmu_disable_virt(void);
+
+#if defined(CONFIG_PERF_EVENTS_AMD_BRS)
+
+#define PERF_NEEDS_LOPWR_CB 1
+
+/*
+ * architectural low power callback impacts
+ * drivers/acpi/processor_idle.c
+ * drivers/acpi/acpi_pad.c
+ */
+extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
+
+DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+
+static inline void perf_lopwr_cb(bool lopwr_in)
+{
+ static_call_mod(perf_lopwr_cb)(lopwr_in);
+}
+
+#endif /* PERF_NEEDS_LOPWR_CB */
+
#else
static inline void amd_pmu_enable_virt(void) { }
static inline void amd_pmu_disable_virt(void) { }
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 62ad61d6fefc..c7ec5bb88334 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -84,8 +84,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..44e2d6f1dbaa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,17 +15,14 @@
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
: (prot))
-/*
- * Macros to add or remove encryption attribute
- */
-#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
-#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
-
#ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
#include <asm/x86_init.h>
-#include <asm/fpu/xstate.h>
+#include <asm/pkru.h>
#include <asm/fpu/api.h>
+#include <asm/coco.h>
#include <asm-generic/pgtable_uffd.h>
+#include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -36,6 +33,12 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(cc_mkenc(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot)))
+
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
@@ -126,35 +129,6 @@ static inline int pte_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_DIRTY;
}
-
-static inline u32 read_pkru(void)
-{
- if (boot_cpu_has(X86_FEATURE_OSPKE))
- return rdpkru();
- return 0;
-}
-
-static inline void write_pkru(u32 pkru)
-{
- struct pkru_state *pk;
-
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return;
-
- pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU);
-
- /*
- * The PKRU value in xstate needs to be in sync with the value that is
- * written to the CPU. The FPU restore on return to userland would
- * otherwise load the previous value again.
- */
- fpregs_lock();
- if (pk)
- pk->pkru = pkru;
- __write_pkru(pkru);
- fpregs_unlock();
-}
-
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
@@ -675,11 +649,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
-static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
-{
- return canon_pgprot(prot);
-}
-
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
enum page_cache_mode pcm,
enum page_cache_mode new_pcm)
@@ -781,7 +750,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
- mm_tlb_flush_pending(mm))
+ atomic_read(&mm->tlb_flush_pending))
return true;
return false;
@@ -865,9 +834,9 @@ static inline int pud_present(pud_t pud)
return pud_flags(pud) & _PAGE_PRESENT;
}
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
{
- return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
+ return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud));
}
/*
@@ -906,9 +875,9 @@ static inline int p4d_present(p4d_t p4d)
return p4d_flags(p4d) & _PAGE_PRESENT;
}
-static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
{
- return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+ return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}
/*
@@ -1035,18 +1004,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
+ page_table_check_pte_set(mm, addr, ptep, pte);
set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(mm, addr, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
+ page_table_check_pud_set(mm, addr, pudp, pud);
native_set_pud(pudp, pud);
}
@@ -1077,6 +1049,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
return pte;
}
@@ -1092,6 +1065,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
@@ -1138,14 +1112,22 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- return native_pmdp_get_and_clear(pmdp);
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+
+ page_table_check_pmd_clear(mm, addr, pmd);
+
+ return pmd;
}
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- return native_pudp_get_and_clear(pudp);
+ pud_t pud = native_pudp_get_and_clear(pudp);
+
+ page_table_check_pud_clear(mm, addr, pud);
+
+ return pud;
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -1166,6 +1148,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
@@ -1175,6 +1158,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
}
}
#endif
+
+#define __HAVE_ARCH_PMDP_INVALIDATE_AD
+extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
@@ -1244,7 +1232,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
- * dst - pointer to pgd range anwhere on a pgd page
+ * dst - pointer to pgd range anywhere on a pgd page
* src - ""
* count - the number of pgds to copy.
*
@@ -1293,6 +1281,23 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
}
+#ifdef _PAGE_SWP_EXCLUSIVE
+#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
+}
+
+static inline int pte_swp_exclusive(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
+}
+#endif /* _PAGE_SWP_EXCLUSIVE */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
@@ -1360,32 +1365,6 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
-#define PKRU_BITS_PER_PKEY 2
-
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-extern u32 init_pkru_value;
-#else
-#define init_pkru_value 0
-#endif
-
-static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
-{
- int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
- return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
-}
-
-static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
-{
- int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
- /*
- * Access-disable disables writes too so we need to check
- * both bits here.
- */
- return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
-}
-
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -1458,6 +1437,23 @@ static inline bool arch_faults_on_old_pte(void)
return false;
}
+#ifdef CONFIG_PAGE_TABLE_CHECK
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+ return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
+}
+
+static inline bool pmd_user_accessible_page(pmd_t pmd)
+{
+ return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER);
+}
+
+static inline bool pud_user_accessible_page(pud_t pud)
+{
+ return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER);
+}
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 56d0399a0cd1..e479491da8d5 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -186,7 +186,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@@ -203,6 +203,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
* F (2) in swp entry is used to record when a pagetable is
* writeprotected by userfaultfd WP support.
*
+ * E (3) in swp entry is used to rememeber PG_anon_exclusive.
+ *
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
*
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 91ac10654570..70e360a2e5fb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -163,4 +163,9 @@ extern unsigned int ptrs_per_p4d;
#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+/*
+ * We borrow bit 3 to remember PG_anon_exclusive.
+ */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..bdaf8391e2e0 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -7,8 +7,6 @@
#include <asm/page_types.h>
-#define FIRST_USER_ADDRESS 0UL
-
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
#define _PAGE_BIT_USER 2 /* userspace addressable */
@@ -112,9 +110,11 @@
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX (_AT(pteval_t, 0))
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
+#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
@@ -561,10 +561,6 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
-
-struct mm_struct;
-extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 2ff9b98812b7..2e6c04d8a45b 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,21 +2,19 @@
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
-#define ARCH_DEFAULT_PKEY 0
-
/*
* If more than 16 keys are ever supported, a thorough audit
* will be necessary to ensure that the types that store key
* numbers and masks have sufficient capacity.
*/
-#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
static inline bool arch_pkeys_enabled(void)
{
- return boot_cpu_has(X86_FEATURE_OSPKE);
+ return cpu_feature_enabled(X86_FEATURE_OSPKE);
}
/*
@@ -26,7 +24,7 @@ static inline bool arch_pkeys_enabled(void)
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
@@ -37,15 +35,12 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
int prot, int pkey)
{
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return 0;
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
-extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
@@ -120,12 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey)
return 0;
}
-extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-extern void copy_init_pkru_to_fpregs(void);
-
static inline int vma_pkey(struct vm_area_struct *vma)
{
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
new file mode 100644
index 000000000000..74f0a2d34ffd
--- /dev/null
+++ b/arch/x86/include/asm/pkru.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKRU_H
+#define _ASM_X86_PKRU_H
+
+#include <asm/cpufeature.h>
+
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
+#define PKRU_BITS_PER_PKEY 2
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+extern u32 init_pkru_value;
+#define pkru_get_init_value() READ_ONCE(init_pkru_value)
+#else
+#define init_pkru_value 0
+#define pkru_get_init_value() 0
+#endif
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u32 read_pkru(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return rdpkru();
+ return 0;
+}
+
+static inline void write_pkru(u32 pkru)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+ /*
+ * WRPKRU is relatively expensive compared to RDPKRU.
+ * Avoid WRPKRU when it would not change the value.
+ */
+ if (pkru != rdpkru())
+ wrpkru(pkru);
+}
+
+static inline void pkru_write_default(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ wrpkru(pkru_get_init_value());
+}
+
+#endif
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index f8cb8af4de5c..5f6daea1ee24 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
@@ -108,16 +108,18 @@ static __always_inline bool should_resched(int preempt_offset)
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void preempt_schedule_thunk(void);
-#define __preempt_schedule_func preempt_schedule_thunk
+#define preempt_schedule_dynamic_enabled preempt_schedule_thunk
+#define preempt_schedule_dynamic_disabled NULL
extern asmlinkage void preempt_schedule_notrace(void);
extern asmlinkage void preempt_schedule_notrace_thunk(void);
-#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace_thunk
+#define preempt_schedule_notrace_dynamic_disabled NULL
#ifdef CONFIG_PREEMPT_DYNAMIC
-DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
#define __preempt_schedule() \
do { \
@@ -125,7 +127,7 @@ do { \
asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
} while (0)
-DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
#define __preempt_schedule_notrace() \
do { \
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
index df700a6cc869..efe3e46e454b 100644
--- a/arch/x86/include/asm/processor-cyrix.h
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -5,14 +5,14 @@
* Access order is always 0x22 (=offset), 0x23 (=value)
*/
+#include <asm/pc-conf-reg.h>
+
static inline u8 getCx86(u8 reg)
{
- outb(reg, 0x22);
- return inb(0x23);
+ return pc_conf_get(reg);
}
static inline void setCx86(u8 reg, u8 data)
{
- outb(reg, 0x22);
- outb(data, 0x23);
+ pc_conf_set(reg, data);
}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f1b9ed5efaa9..356308c73951 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -119,6 +119,8 @@ struct cpuinfo_x86 {
int x86_cache_mbm_width_offset;
int x86_power;
unsigned long loops_per_jiffy;
+ /* protected processor identification number */
+ u64 ppin;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
@@ -136,6 +138,8 @@ struct cpuinfo_x86 {
u16 logical_die_id;
/* Index into per_cpu list: */
u16 cpu_index;
+ /* Is SMT active on this core? */
+ bool smt_active;
u32 microcode;
/* Address space bits used by the cache internally */
u8 x86_cache_bits;
@@ -162,7 +166,8 @@ enum cpuid_regs_idx {
#define X86_VENDOR_NSC 8
#define X86_VENDOR_HYGON 9
#define X86_VENDOR_ZHAOXIN 10
-#define X86_VENDOR_NUM 11
+#define X86_VENDOR_VORTEX 11
+#define X86_VENDOR_NUM 12
#define X86_VENDOR_UNKNOWN 0xff
@@ -314,11 +319,6 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
- /*
- * We store cpu_current_top_of_stack in sp1 so it's always accessible.
- * Linux does not use ring 1, so sp1 is not otherwise needed.
- */
u64 sp1;
/*
@@ -426,12 +426,7 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
@@ -439,6 +434,9 @@ struct fixed_percpu_data {
* GCC hardcodes the stack canary as %gs:40. Since the
* irq_stack is the object at %gs:0, we reserve the bottom
* 48 bytes of the irq stack for the canary.
+ *
+ * Once we are willing to require -mstack-protector-guard-symbol=
+ * support for x86_64 stackprotector, we can get rid of this.
*/
char gs_base[40];
unsigned long stack_canary;
@@ -460,25 +458,12 @@ extern asmlinkage void ignore_sysret(void);
void current_save_fsgs(void);
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-/*
- * Make sure stack canary segment base is cached-aligned:
- * "For Intel Atom processors, avoid non zero segment base address
- * that is not aligned to cache line boundary at all cost."
- * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
- */
-struct stack_canary {
- char __pad[20]; /* canary at %gs:20 */
- unsigned long canary;
-};
-DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif /* !X86_64 */
-extern unsigned int fpu_kernel_xstate_size;
-extern unsigned int fpu_user_xstate_size;
-
struct perf_event;
struct thread_struct {
@@ -527,14 +512,24 @@ struct thread_struct {
struct io_bitmap *io_bitmap;
/*
- * IOPL. Priviledge level dependent I/O permission which is
+ * IOPL. Privilege level dependent I/O permission which is
* emulated via the I/O bitmap to prevent user space from disabling
* interrupts.
*/
unsigned long iopl_emul;
+ unsigned int iopl_warn:1;
unsigned int sig_on_uaccess_err:1;
+ /*
+ * Protection Keys Register for Userspace. Loaded immediately on
+ * context switch. Store it in thread_struct to avoid a lookup in
+ * the tasks's FPU xstate buffer. This value is only valid when a
+ * task is scheduled out. For 'current' the authoritative source of
+ * PKRU is the hardware itself.
+ */
+ u32 pkru;
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
@@ -543,12 +538,12 @@ struct thread_struct {
*/
};
-/* Whitelist the FPU state from the task_struct for hardened usercopy. */
+extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size);
+
static inline void arch_thread_struct_whitelist(unsigned long *offset,
unsigned long *size)
{
- *offset = offsetof(struct thread_struct, fpu.state);
- *size = fpu_kernel_xstate_size;
+ fpu_thread_struct_whitelist(offset, size);
}
static inline void
@@ -564,7 +559,7 @@ static __always_inline void native_swapgs(void)
#endif
}
-static inline unsigned long current_top_of_stack(void)
+static __always_inline unsigned long current_top_of_stack(void)
{
/*
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
@@ -574,7 +569,7 @@ static inline unsigned long current_top_of_stack(void)
return this_cpu_read_stable(cpu_current_top_of_stack);
}
-static inline bool on_thread_stack(void)
+static __always_inline bool on_thread_stack(void)
{
return (unsigned long)(current_top_of_stack() -
current_stack_pointer) < THREAD_SIZE;
@@ -595,7 +590,7 @@ static inline void load_sp0(unsigned long sp0)
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
-unsigned long get_wchan(struct task_struct *p);
+unsigned long __get_wchan(struct task_struct *p);
/*
* Generic CPUID function
@@ -680,6 +675,7 @@ extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
+extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);
@@ -802,17 +798,24 @@ extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+extern u16 get_llc_id(unsigned int cpu);
+
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
+extern u32 amd_get_highest_perf(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
+#define for_each_possible_hypervisor_cpuid_base(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
uint32_t base, eax, signature[3];
- for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ for_each_possible_hypervisor_cpuid_base(base) {
cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
if (!memcmp(sig, signature, 12) &&
@@ -834,7 +837,7 @@ bool xen_set_default_idle(void);
#define xen_set_default_idle 0
#endif
-void stop_this_cpu(void *dummy);
+void __noreturn stop_this_cpu(void *dummy);
void microcode_check(void);
enum l1tf_mitigations {
@@ -854,4 +857,12 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index b6a9d51d1d79..12ef86b19910 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -4,6 +4,8 @@
#include <asm/ldt.h>
+struct task_struct;
+
/* misc architecture specific prototypes */
void syscall_init(void);
@@ -11,6 +13,8 @@ void syscall_init(void);
#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void);
void entry_SYSCALL_64_safe_stack(void);
+void entry_SYSRETQ_unsafe_stack(void);
+void entry_SYSRETQ_end(void);
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
@@ -26,6 +30,8 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
+void entry_SYSRETL_compat_unsafe_stack(void);
+void entry_SYSRETL_compat_end(void);
void entry_INT80_compat(void);
#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);
@@ -33,11 +39,9 @@ void xen_entry_INT80_compat(void);
#endif
void x86_configure_nx(void);
-void x86_report_nx(void);
extern int reboot_force;
-long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled);
+long do_arch_prctl_common(int option, unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 409f661481e1..f4db78b09c8f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -37,7 +37,10 @@ struct pt_regs {
unsigned short __esh;
unsigned short fs;
unsigned short __fsh;
- /* On interrupt, gs and __gsh store the vector number. */
+ /*
+ * On interrupt, gs and __gsh store the vector number. They never
+ * store gs any more.
+ */
unsigned short gs;
unsigned short __gsh;
/* On interrupt, this is the error code. */
@@ -134,7 +137,7 @@ static __always_inline int user_mode(struct pt_regs *regs)
#endif
}
-static inline int v8086_mode(struct pt_regs *regs)
+static __always_inline int v8086_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
return (regs->flags & X86_VM_MASK);
@@ -178,14 +181,18 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
-static inline bool ip_within_syscall_gap(struct pt_regs *regs)
+static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETQ_end);
#ifdef CONFIG_IA32_EMULATION
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETL_compat_end);
#endif
return ret;
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d86ab942219c..d87451df480b 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -53,6 +53,7 @@ static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
static inline void queued_spin_unlock(struct qspinlock *lock)
{
+ kcsan_release();
pv_queued_spin_unlock(lock);
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 159622ee0674..892fd8c3a6f7 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -2,6 +2,8 @@
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+#include <asm/ibt.h>
+
/*
* For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
* registers. For i386, however, only 1 32-bit register needs to be saved
@@ -39,6 +41,7 @@ asm (".pushsection .text;"
".type " PV_UNLOCK ", @function;"
".align 4,0x90;"
PV_UNLOCK ": "
+ ASM_ENDBR
FRAME_BEGIN
"push %rdx;"
"mov $0x1,%eax;"
@@ -48,7 +51,7 @@ asm (".pushsection .text;"
"jne .slowpath;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".slowpath: "
"push %rsi;"
"movzbl %al,%esi;"
@@ -56,7 +59,7 @@ asm (".pushsection .text;"
"pop %rsi;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
".popsection");
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..fd6f6e5b755a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -25,6 +25,7 @@ struct real_mode_header {
u32 sev_es_trampoline_start;
#endif
#ifdef CONFIG_X86_64
+ u32 trampoline_start64;
u32 trampoline_pgd;
#endif
/* ACPI S3 wakeup */
@@ -89,6 +90,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
}
void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index b2d504f11937..aff774775c67 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -35,11 +35,7 @@
# define NEED_CMOV 0
#endif
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
-#else
# define NEED_3DNOW 0
-#endif
#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 7fdd4facfce7..2e7890dd58a4 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -4,6 +4,7 @@
#include <linux/const.h>
#include <asm/alternative.h>
+#include <asm/ibt.h>
/*
* Constructor for a conventional segment GDT (or LDT) entry.
@@ -95,7 +96,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
+ * 28 - unused
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -118,7 +119,6 @@
#define GDT_ENTRY_ESPFIX_SS 26
#define GDT_ENTRY_PERCPU 27
-#define GDT_ENTRY_STACK_CANARY 28
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
@@ -158,12 +158,6 @@
# define __KERNEL_PERCPU 0
#endif
-#ifdef CONFIG_STACKPROTECTOR
-# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
-#else
-# define __KERNEL_STACK_CANARY 0
-#endif
-
#else /* 64-bit: */
#include <asm/cache.h>
@@ -282,7 +276,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
* vector has no error code (two bytes), a 'push $vector_number' (two
* bytes), and a jump to the common entry code (up to five bytes).
*/
-#define EARLY_IDT_HANDLER_SIZE 9
+#define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE)
/*
* xen_early_idt_handler_array is for Xen pv guests: for each entry in
@@ -290,7 +284,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
* pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to
* max 8 bytes.
*/
-#define XEN_EARLY_IDT_HANDLER_SIZE 8
+#define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE)
#ifndef __ASSEMBLY__
@@ -314,14 +308,7 @@ do { \
\
asm volatile(" \n" \
"1: movl %k0,%%" #seg " \n" \
- \
- ".section .fixup,\"ax\" \n" \
- "2: xorl %k0,%k0 \n" \
- " jmp 1b \n" \
- ".previous \n" \
- \
- _ASM_EXTABLE(1b, 2b) \
- \
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\
: "+r" (__val) : : "memory"); \
} while (0)
@@ -346,7 +333,7 @@ static inline void __loadsegment_fs(unsigned short value)
"1: movw %0, %%fs \n"
"2: \n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS)
: : "rm" (value) : "memory");
}
@@ -363,25 +350,6 @@ static inline void __loadsegment_fs(unsigned short value)
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-/*
- * x86-32 user GS accessors:
- */
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_32_LAZY_GS
-# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
-# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-# define task_user_gs(tsk) ((tsk)->thread.gs)
-# define lazy_save_gs(v) savesegment(gs, (v))
-# define lazy_load_gs(v) loadsegment(gs, (v))
-# else /* X86_32_LAZY_GS */
-# define get_user_gs(regs) (u16)((regs)->gs)
-# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
-# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
-# define lazy_save_gs(v) do { } while (0)
-# define lazy_load_gs(v) do { } while (0)
-# endif /* X86_32_LAZY_GS */
-#endif /* X86_32 */
-
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 4352f08bfbb5..b45c4d27fd46 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,14 +2,15 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
- * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack
- * Executability : eXeutable, NoteXecutable
+ * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack
+ * Executability : eXecutable, NoteXecutable
* Read/Write : ReadOnly, ReadWrite
* Presence : NotPresent
* Encryption : Encrypted, Decrypted
@@ -52,7 +53,6 @@ int set_memory_global(unsigned long addr, int numpages);
int set_pages_array_uc(struct page **pages, int addrinarray);
int set_pages_array_wc(struct page **pages, int addrinarray);
-int set_pages_array_wt(struct page **pages, int addrinarray);
int set_pages_array_wb(struct page **pages, int addrinarray);
/*
@@ -86,53 +86,4 @@ bool kernel_page_present(struct page *page);
extern int kernel_set_to_readonly;
-#ifdef CONFIG_X86_64
-/*
- * Prevent speculative access to the page by either unmapping
- * it (if we do not require access to any part of the page) or
- * marking it uncacheable (if we want to try to retrieve data
- * from non-poisoned lines in the page).
- */
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
-{
- unsigned long decoy_addr;
- int rc;
-
- /*
- * We would like to just call:
- * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
- * but doing that would radically increase the odds of a
- * speculative access to the poison page because we'd have
- * the virtual address of the kernel 1:1 mapping sitting
- * around in registers.
- * Instead we get tricky. We create a non-canonical address
- * that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_XX() properly sanitizing any __pa()
- * results with __PHYSICAL_MASK or PTE_PFN_MASK.
- */
- decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-
- if (unmap)
- rc = set_memory_np(decoy_addr, 1);
- else
- rc = set_memory_uc(decoy_addr, 1);
- if (rc)
- pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
- return rc;
-}
-#define set_mce_nospec set_mce_nospec
-
-/* Restore full speculative operation to the pfn. */
-static inline int clear_mce_nospec(unsigned long pfn)
-{
- return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
-}
-#define clear_mce_nospec clear_mce_nospec
-#else
-/*
- * Few people would run a 32-bit kernel on a machine that supports
- * recoverable errors because they have too much memory to boot 32-bit.
- */
-#endif
-
#endif /* _ASM_X86_SET_MEMORY_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 389d851a02c4..f8b9ee97a891 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include <asm/page_types.h>
+#include <asm/ibt.h>
#ifdef __i386__
@@ -49,7 +50,6 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
-extern unsigned long __startup_secondary_64(void);
extern void startup_64_setup_env(unsigned long physbase);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
@@ -108,32 +108,16 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);
/*
- * Reserve space in the brk section. The name must be unique within
- * the file, and somewhat descriptive. The size is in bytes. Must be
- * used at file scope.
+ * Reserve space in the .brk section, which is a block of memory from which the
+ * caller is allowed to allocate very early (before even memblock is available)
+ * by calling extend_brk(). All allocated memory will be eventually converted
+ * to memblock. Any leftover unallocated memory will be freed.
*
- * (This uses a temp function to wrap the asm so we can pass it the
- * size parameter; otherwise we wouldn't be able to. We can't use a
- * "section" attribute on a normal variable because it always ends up
- * being @progbits, which ends up allocating space in the vmlinux
- * executable.)
+ * The size is in bytes.
*/
-#define RESERVE_BRK(name,sz) \
- static void __section(".discard.text") __used notrace \
- __brk_reservation_fn_##name##__(void) { \
- asm volatile ( \
- ".pushsection .brk_reservation,\"aw\",@nobits;" \
- ".brk." #name ":" \
- " 1:.skip %c0;" \
- " .size .brk." #name ", . - 1b;" \
- " .popsection" \
- : : "i" (sz)); \
- }
-
-/* Helper for reserving space for arrays of things */
-#define RESERVE_BRK_ARRAY(type, name, entries) \
- type *name; \
- RESERVE_BRK(name, sizeof(type) * entries)
+#define RESERVE_BRK(name, size) \
+ __section(".bss..brk") __aligned(1) __used \
+ static char __brk_##name[size]
extern void probe_roms(void);
#ifdef __i386__
@@ -146,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
-#else
-#define RESERVE_BRK(name,sz) \
- .pushsection .brk_reservation,"aw",@nobits; \
-.brk.name: \
-1: .skip sz; \
- .size .brk.name,.-1b; \
+
+#else /* __ASSEMBLY */
+
+.macro __RESERVE_BRK name, size
+ .pushsection .bss..brk, "aw"
+SYM_DATA_START(__brk_\name)
+ .skip \size
+SYM_DATA_END(__brk_\name)
.popsection
+.endm
+
+#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
+
#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
new file mode 100644
index 000000000000..b8357d6ecd47
--- /dev/null
+++ b/arch/x86/include/asm/sev-common.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD SEV header common between the guest and the hypervisor.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ */
+
+#ifndef __ASM_X86_SEV_COMMON_H
+#define __ASM_X86_SEV_COMMON_H
+
+#define GHCB_MSR_INFO_POS 0
+#define GHCB_DATA_LOW 12
+#define GHCB_MSR_INFO_MASK (BIT_ULL(GHCB_DATA_LOW) - 1)
+
+#define GHCB_DATA(v) \
+ (((unsigned long)(v) & ~GHCB_MSR_INFO_MASK) >> GHCB_DATA_LOW)
+
+/* SEV Information Request/Response */
+#define GHCB_MSR_SEV_INFO_RESP 0x001
+#define GHCB_MSR_SEV_INFO_REQ 0x002
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
+ GHCB_MSR_SEV_INFO_RESP)
+
+#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
+
+/* CPUID Request/Response */
+#define GHCB_MSR_CPUID_REQ 0x004
+#define GHCB_MSR_CPUID_RESP 0x005
+#define GHCB_MSR_CPUID_FUNC_POS 32
+#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
+#define GHCB_MSR_CPUID_VALUE_POS 32
+#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
+#define GHCB_MSR_CPUID_REG_POS 30
+#define GHCB_MSR_CPUID_REG_MASK 0x3
+#define GHCB_CPUID_REQ_EAX 0
+#define GHCB_CPUID_REQ_EBX 1
+#define GHCB_CPUID_REQ_ECX 2
+#define GHCB_CPUID_REQ_EDX 3
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
+
+/* AP Reset Hold */
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+
+/* GHCB GPA Register */
+#define GHCB_MSR_REG_GPA_REQ 0x012
+#define GHCB_MSR_REG_GPA_REQ_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_REG_GPA_REQ)
+
+#define GHCB_MSR_REG_GPA_RESP 0x013
+#define GHCB_MSR_REG_GPA_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+/*
+ * SNP Page State Change Operation
+ *
+ * GHCBData[55:52] - Page operation:
+ * 0x0001 Page assignment, Private
+ * 0x0002 Page assignment, Shared
+ */
+enum psc_op {
+ SNP_PAGE_STATE_PRIVATE = 1,
+ SNP_PAGE_STATE_SHARED,
+};
+
+#define GHCB_MSR_PSC_REQ 0x014
+#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \
+ /* GHCBData[55:52] */ \
+ (((u64)((op) & 0xf) << 52) | \
+ /* GHCBData[51:12] */ \
+ ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_PSC_REQ)
+
+#define GHCB_MSR_PSC_RESP 0x015
+#define GHCB_MSR_PSC_RESP_VAL(val) \
+ /* GHCBData[63:32] */ \
+ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+
+/* GHCB Hypervisor Feature Request/Response */
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+#define GHCB_HV_FT_SNP BIT_ULL(0)
+#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
+
+/* SNP Page State Change NAE event */
+#define VMGEXIT_PSC_MAX_ENTRY 253
+
+struct psc_hdr {
+ u16 cur_entry;
+ u16 end_entry;
+ u32 reserved;
+} __packed;
+
+struct psc_entry {
+ u64 cur_page : 12,
+ gfn : 40,
+ operation : 4,
+ pagesize : 1,
+ reserved : 7;
+} __packed;
+
+struct snp_psc_desc {
+ struct psc_hdr hdr;
+ struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
+} __packed;
+
+/* Guest message request error code */
+#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32)
+
+#define GHCB_MSR_TERM_REQ 0x100
+#define GHCB_MSR_TERM_REASON_SET_POS 12
+#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
+#define GHCB_MSR_TERM_REASON_POS 16
+#define GHCB_MSR_TERM_REASON_MASK 0xff
+
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
+/* Error codes from reason set 0 */
+#define SEV_TERM_SET_GEN 0
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
+#define GHCB_SNP_UNSUPPORTED 2
+
+/* Linux-specific reason codes (used with reason set 1) */
+#define SEV_TERM_SET_LINUX 1
+#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */
+#define GHCB_TERM_PSC 1 /* Page State Change failure */
+#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */
+#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */
+#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */
+#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */
+
+#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+
+/*
+ * Error codes related to GHCB input that can be communicated back to the guest
+ * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
+#endif
diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
deleted file mode 100644
index cf1d957c7091..000000000000
--- a/arch/x86/include/asm/sev-es.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * AMD Encrypted Register State Support
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- */
-
-#ifndef __ASM_ENCRYPTED_STATE_H
-#define __ASM_ENCRYPTED_STATE_H
-
-#include <linux/types.h>
-#include <asm/insn.h>
-
-#define GHCB_SEV_INFO 0x001UL
-#define GHCB_SEV_INFO_REQ 0x002UL
-#define GHCB_INFO(v) ((v) & 0xfffUL)
-#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL)
-#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL)
-#define GHCB_PROTO_OUR 0x0001UL
-#define GHCB_SEV_CPUID_REQ 0x004UL
-#define GHCB_CPUID_REQ_EAX 0
-#define GHCB_CPUID_REQ_EBX 1
-#define GHCB_CPUID_REQ_ECX 2
-#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \
- (((unsigned long)reg & 3) << 30) | \
- (((unsigned long)fn) << 32))
-
-#define GHCB_PROTOCOL_MAX 0x0001UL
-#define GHCB_DEFAULT_USAGE 0x0000UL
-
-#define GHCB_SEV_CPUID_RESP 0x005UL
-#define GHCB_SEV_TERMINATE 0x100UL
-#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & 0x7) << 12) | \
- ((((u64)reason_val) & 0xff) << 16))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
-
-#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff)
-#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
-
-enum es_result {
- ES_OK, /* All good */
- ES_UNSUPPORTED, /* Requested operation not supported */
- ES_VMM_ERROR, /* Unexpected state from the VMM */
- ES_DECODE_FAILED, /* Instruction decoding failed */
- ES_EXCEPTION, /* Instruction caused exception */
- ES_RETRY, /* Retry instruction emulation */
-};
-
-struct es_fault_info {
- unsigned long vector;
- unsigned long error_code;
- unsigned long cr2;
-};
-
-struct pt_regs;
-
-/* ES instruction emulation context */
-struct es_em_ctxt {
- struct pt_regs *regs;
- struct insn insn;
- struct es_fault_info fi;
-};
-
-void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
-
-static inline u64 lower_bits(u64 val, unsigned int bits)
-{
- u64 mask = (1ULL << bits) - 1;
-
- return (val & mask);
-}
-
-struct real_mode_header;
-enum stack_type;
-
-/* Early IDT entry points for #VC handler */
-extern void vc_no_ghcb(void);
-extern void vc_boot_ghcb(void);
-extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
-
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-extern struct static_key_false sev_es_enable_key;
-extern void __sev_es_ist_enter(struct pt_regs *regs);
-extern void __sev_es_ist_exit(void);
-static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
-{
- if (static_branch_unlikely(&sev_es_enable_key))
- __sev_es_ist_enter(regs);
-}
-static __always_inline void sev_es_ist_exit(void)
-{
- if (static_branch_unlikely(&sev_es_enable_key))
- __sev_es_ist_exit();
-}
-extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
-extern void __sev_es_nmi_complete(void);
-static __always_inline void sev_es_nmi_complete(void)
-{
- if (static_branch_unlikely(&sev_es_enable_key))
- __sev_es_nmi_complete();
-}
-extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
-#else
-static inline void sev_es_ist_enter(struct pt_regs *regs) { }
-static inline void sev_es_ist_exit(void) { }
-static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
-static inline void sev_es_nmi_complete(void) { }
-static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
-#endif
-
-#endif
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
new file mode 100644
index 000000000000..19514524f0f8
--- /dev/null
+++ b/arch/x86/include/asm/sev.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#ifndef __ASM_ENCRYPTED_STATE_H
+#define __ASM_ENCRYPTED_STATE_H
+
+#include <linux/types.h>
+#include <asm/insn.h>
+#include <asm/sev-common.h>
+#include <asm/bootparam.h>
+
+#define GHCB_PROTOCOL_MIN 1ULL
+#define GHCB_PROTOCOL_MAX 2ULL
+#define GHCB_DEFAULT_USAGE 0ULL
+
+#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
+
+enum es_result {
+ ES_OK, /* All good */
+ ES_UNSUPPORTED, /* Requested operation not supported */
+ ES_VMM_ERROR, /* Unexpected state from the VMM */
+ ES_DECODE_FAILED, /* Instruction decoding failed */
+ ES_EXCEPTION, /* Instruction caused exception */
+ ES_RETRY, /* Retry instruction emulation */
+};
+
+struct es_fault_info {
+ unsigned long vector;
+ unsigned long error_code;
+ unsigned long cr2;
+};
+
+struct pt_regs;
+
+/* ES instruction emulation context */
+struct es_em_ctxt {
+ struct pt_regs *regs;
+ struct insn insn;
+ struct es_fault_info fi;
+};
+
+/*
+ * AMD SEV Confidential computing blob structure. The structure is
+ * defined in OVMF UEFI firmware header:
+ * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h
+ */
+#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41
+struct cc_blob_sev_info {
+ u32 magic;
+ u16 version;
+ u16 reserved;
+ u64 secrets_phys;
+ u32 secrets_len;
+ u32 rsvd1;
+ u64 cpuid_phys;
+ u32 cpuid_len;
+ u32 rsvd2;
+} __packed;
+
+void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
+
+static inline u64 lower_bits(u64 val, unsigned int bits)
+{
+ u64 mask = (1ULL << bits) - 1;
+
+ return (val & mask);
+}
+
+struct real_mode_header;
+enum stack_type;
+struct ghcb;
+
+/* Early IDT entry points for #VC handler */
+extern void vc_no_ghcb(void);
+extern void vc_boot_ghcb(void);
+extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
+
+/* Software defined (when rFlags.CF = 1) */
+#define PVALIDATE_FAIL_NOUPDATE 255
+
+/* RMP page size */
+#define RMP_PG_SIZE_4K 0
+
+#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
+
+/* SNP Guest message request */
+struct snp_req_data {
+ unsigned long req_gpa;
+ unsigned long resp_gpa;
+ unsigned long data_gpa;
+ unsigned int data_npages;
+};
+
+struct sev_guest_platform_data {
+ u64 secrets_gpa;
+};
+
+/*
+ * The secrets page contains 96-bytes of reserved field that can be used by
+ * the guest OS. The guest OS uses the area to save the message sequence
+ * number for each VMPCK.
+ *
+ * See the GHCB spec section Secret page layout for the format for this area.
+ */
+struct secrets_os_area {
+ u32 msg_seqno_0;
+ u32 msg_seqno_1;
+ u32 msg_seqno_2;
+ u32 msg_seqno_3;
+ u64 ap_jump_table_pa;
+ u8 rsvd[40];
+ u8 guest_usage[32];
+} __packed;
+
+#define VMPCK_KEY_LEN 32
+
+/* See the SNP spec version 0.9 for secrets page format */
+struct snp_secrets_page_layout {
+ u32 version;
+ u32 imien : 1,
+ rsvd1 : 31;
+ u32 fms;
+ u32 rsvd2;
+ u8 gosvw[16];
+ u8 vmpck0[VMPCK_KEY_LEN];
+ u8 vmpck1[VMPCK_KEY_LEN];
+ u8 vmpck2[VMPCK_KEY_LEN];
+ u8 vmpck3[VMPCK_KEY_LEN];
+ struct secrets_os_area os_area;
+ u8 rsvd3[3840];
+} __packed;
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern struct static_key_false sev_es_enable_key;
+extern void __sev_es_ist_enter(struct pt_regs *regs);
+extern void __sev_es_ist_exit(void);
+static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_ist_enter(regs);
+}
+static __always_inline void sev_es_ist_exit(void)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_ist_exit();
+}
+extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
+extern void __sev_es_nmi_complete(void);
+static __always_inline void sev_es_nmi_complete(void)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_nmi_complete();
+}
+extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ bool set_ghcb_msr,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2);
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
+{
+ int rc;
+
+ /* "rmpadjust" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t"
+ : "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(attrs)
+ : "memory", "cc");
+
+ return rc;
+}
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
+{
+ bool no_rmpupdate;
+ int rc;
+
+ /* "pvalidate" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
+ CC_SET(c)
+ : CC_OUT(c) (no_rmpupdate), "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(validate)
+ : "memory", "cc");
+
+ if (no_rmpupdate)
+ return PVALIDATE_FAIL_NOUPDATE;
+
+ return rc;
+}
+void setup_ghcb(void);
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages);
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages);
+void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
+void snp_set_memory_shared(unsigned long vaddr, unsigned int npages);
+void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
+void snp_set_wakeup_secondary_cpu(void);
+bool snp_init(struct boot_params *bp);
+void snp_abort(void);
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);
+#else
+static inline void sev_es_ist_enter(struct pt_regs *regs) { }
+static inline void sev_es_ist_exit(void) { }
+static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
+static inline void sev_es_nmi_complete(void) { }
+static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
+static inline void setup_ghcb(void) { }
+static inline void __init
+early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
+static inline void __init
+early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
+static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
+static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { }
+static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { }
+static inline void snp_set_wakeup_secondary_cpu(void) { }
+static inline bool snp_init(struct boot_params *bp) { return false; }
+static inline void snp_abort(void) { }
+static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input,
+ unsigned long *fw_err)
+{
+ return -ENOTTY;
+}
+#endif
+
+#endif
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h
index dd7602c44c72..3f9334ef67cd 100644
--- a/arch/x86/kernel/cpu/sgx/arch.h
+++ b/arch/x86/include/asm/sgx.h
@@ -2,15 +2,20 @@
/**
* Copyright(c) 2016-20 Intel Corporation.
*
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
*/
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
#include <linux/bits.h>
#include <linux/types.h>
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions. The two should not be mixed
+ * together for better readability. The architectural definitions come first.
+ */
+
/* The SGX specific CPUID function. */
#define SGX_CPUID 0x12
/* EPC enumeration. */
@@ -22,16 +27,54 @@
/* The bitmask for the EPC section type. */
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
+/**
+ * SGX_ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define SGX_ENCLS_FAULT_FLAG 0x40000000
+
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
+ * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
* %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
* %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_UNMASKED_EVENT = 128,
};
@@ -271,7 +314,7 @@ struct sgx_pcmd {
* @header1: constant byte string
* @vendor: must be either 0x0000 or 0x8086
* @date: YYYYMMDD in BCD
- * @header2: costant byte string
+ * @header2: constant byte string
* @swdefined: software defined value
*/
struct sgx_sigstruct_header {
@@ -335,4 +378,19 @@ struct sgx_sigstruct {
#define SGX_LAUNCH_TOKEN_SIZE 304
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h
new file mode 100644
index 000000000000..c0ef921c0586
--- /dev/null
+++ b/arch/x86/include/asm/shared/io.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_IO_H
+#define _ASM_X86_SHARED_IO_H
+
+#include <linux/types.h>
+
+#define BUILDIO(bwl, bw, type) \
+static inline void __out##bwl(type value, u16 port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline type __in##bwl(u16 port) \
+{ \
+ type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+}
+
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
+#undef BUILDIO
+
+#define inb __inb
+#define inw __inw
+#define inl __inl
+#define outb __outb
+#define outw __outw
+#define outl __outl
+
+#endif
diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h
new file mode 100644
index 000000000000..1e6ec10b3a15
--- /dev/null
+++ b/arch/x86/include/asm/shared/msr.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_MSR_H
+#define _ASM_X86_SHARED_MSR_H
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
+
+#endif /* _ASM_X86_SHARED_MSR_H */
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
new file mode 100644
index 000000000000..e53f26228fbb
--- /dev/null
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_TDX_H
+#define _ASM_X86_SHARED_TDX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+#define TDX_HYPERCALL_STANDARD 0
+
+#define TDX_HCALL_HAS_OUTPUT BIT(0)
+#define TDX_HCALL_ISSUE_STI BIT(1)
+
+#define TDX_CPUID_LEAF_ID 0x21
+#define TDX_IDENT "IntelTDX "
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Used in __tdx_hypercall() to pass down and get back registers' values of
+ * the TDCALL instruction when requesting services from the VMM.
+ *
+ * This is a software only structure and not part of the TDX module/VMM ABI.
+ */
+struct tdx_hypercall_args {
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+};
+
+/* Used to request services from the VMM */
+u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags);
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void);
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_SHARED_TDX_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
#endif /* CONFIG_X86_64 */
+void __init init_sigframe_size(void);
+
#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 6fd8410a3910..2dfb5fea13af 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -29,6 +29,7 @@ typedef struct {
#define SA_X32_ABI 0x01000000u
#ifndef CONFIG_COMPAT
+#define compat_sigset_t compat_sigset_t
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 0bc9b0895f33..bab490379c65 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -11,6 +11,7 @@
#include <asm/nops.h>
#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
/* "Raw" instruction opcodes */
#define __ASM_CLAC ".byte 0x0f,0x01,0xca"
@@ -18,29 +19,14 @@
#ifdef __ASSEMBLY__
-#include <asm/alternative-asm.h>
-
-#ifdef CONFIG_X86_SMAP
-
#define ASM_CLAC \
ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP
#define ASM_STAC \
ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP
-#else /* CONFIG_X86_SMAP */
-
-#define ASM_CLAC
-#define ASM_STAC
-
-#endif /* CONFIG_X86_SMAP */
-
#else /* __ASSEMBLY__ */
-#include <asm/alternative.h>
-
-#ifdef CONFIG_X86_SMAP
-
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
@@ -79,19 +65,6 @@ static __always_inline void smap_restore(unsigned long flags)
#define ASM_STAC \
ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)
-#else /* CONFIG_X86_SMAP */
-
-static inline void clac(void) { }
-static inline void stac(void) { }
-
-static inline unsigned long smap_save(void) { return 0; }
-static inline void smap_restore(unsigned long flags) { }
-
-#define ASM_CLAC
-#define ASM_STAC
-
-#endif /* CONFIG_X86_SMAP */
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMAP_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index c0538f82c9a2..81a0211a372d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
return per_cpu(cpu_llc_shared_map, cpu);
}
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -119,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus);
@@ -132,6 +140,7 @@ void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
+void cond_wakeup_cpu0(void);
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 1d3cbaef4bb7..45b18eb94fa1 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -104,25 +104,13 @@ static inline void wrpkru(u32 pkru)
: : "a" (pkru), "c"(ecx), "d"(edx));
}
-static inline void __write_pkru(u32 pkru)
-{
- /*
- * WRPKRU is relatively expensive compared to RDPKRU.
- * Avoid WRPKRU when it would not change the value.
- */
- if (pkru == rdpkru())
- return;
-
- wrpkru(pkru);
-}
-
#else
static inline u32 rdpkru(void)
{
return 0;
}
-static inline void __write_pkru(u32 pkru)
+static inline void wrpkru(u32 pkru)
{
}
#endif
@@ -196,14 +184,15 @@ static inline void wbinvd(void)
native_wbinvd();
}
-#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int selector)
{
+#ifdef CONFIG_X86_64
native_load_gs_index(selector);
-}
-
+#else
+ loadsegment(gs, selector);
#endif
+}
#endif /* CONFIG_PARAVIRT_XXL */
@@ -214,7 +203,7 @@ static inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+ alternative_io(".byte 0x3e; clflush %P0",
".byte 0x66; clflush %P0",
X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
@@ -225,7 +214,7 @@ static inline void clwb(volatile void *__p)
volatile struct { char x[64]; } *p = __p;
asm volatile(ALTERNATIVE_2(
- ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+ ".byte 0x3e; clflush (%[pax])",
".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
X86_FEATURE_CLFLUSHOPT,
".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
@@ -287,7 +276,7 @@ static inline int enqcmds(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
struct { char _[64]; } __iomem *__dst = dst;
- int zf;
+ bool zf;
/*
* ENQCMDS %(rdx), rax
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 7fb482f0f25b..24a8d6c4fb18 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -5,30 +5,23 @@
* Stack protector works by putting predefined pattern at the start of
* the stack frame and verifying that it hasn't been overwritten when
* returning from the function. The pattern is called stack canary
- * and unfortunately gcc requires it to be at a fixed offset from %gs.
- * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
- * and x86_32 use segment registers differently and thus handles this
- * requirement differently.
+ * and unfortunately gcc historically required it to be at a fixed offset
+ * from the percpu segment base. On x86_64, the offset is 40 bytes.
*
- * On x86_64, %gs is shared by percpu area and stack canary. All
- * percpu symbols are zero based and %gs points to the base of percpu
- * area. The first occupant of the percpu area is always
- * fixed_percpu_data which contains stack_canary at offset 40. Userland
- * %gs is always saved and restored on kernel entry and exit using
- * swapgs, so stack protector doesn't add any complexity there.
+ * The same segment is shared by percpu area and stack canary. On
+ * x86_64, percpu symbols are zero based and %gs (64-bit) points to the
+ * base of percpu area. The first occupant of the percpu area is always
+ * fixed_percpu_data which contains stack_canary at the appropriate
+ * offset. On x86_32, the stack canary is just a regular percpu
+ * variable.
*
- * On x86_32, it's slightly more complicated. As in x86_64, %gs is
- * used for userland TLS. Unfortunately, some processors are much
- * slower at loading segment registers with different value when
- * entering and leaving the kernel, so the kernel uses %fs for percpu
- * area and manages %gs lazily so that %gs is switched only when
- * necessary, usually during task switch.
+ * Putting percpu data in %fs on 32-bit is a minor optimization compared to
+ * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely
+ * to load 0 into %fs on exit to usermode, whereas with percpu data in
+ * %gs, we are likely to load a non-null %gs on return to user mode.
*
- * As gcc requires the stack canary at %gs:20, %gs can't be managed
- * lazily if stack protector is enabled, so the kernel saves and
- * restores userland %gs on kernel entry and exit. This behavior is
- * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
- * system.h to hide the details.
+ * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector
+ * support, we can remove some of this complexity.
*/
#ifndef _ASM_STACKPROTECTOR_H
@@ -45,14 +38,6 @@
#include <linux/sched.h>
/*
- * 24 byte read-only segment initializer for stack canary. Linker
- * can't handle the address bit shifting. Address will be set in
- * head_32 for boot CPU and setup_per_cpu_areas() for others.
- */
-#define GDT_STACK_CANARY_INIT \
- [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),
-
-/*
* Initialize the stackprotector canary value.
*
* NOTE: this must only be called from functions that never return
@@ -86,7 +71,7 @@ static __always_inline void boot_init_stack_canary(void)
#ifdef CONFIG_X86_64
this_cpu_write(fixed_percpu_data.stack_canary, canary);
#else
- this_cpu_write(stack_canary.canary, canary);
+ this_cpu_write(__stack_chk_guard, canary);
#endif
}
@@ -95,48 +80,16 @@ static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_64
per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
#else
- per_cpu(stack_canary.canary, cpu) = idle->stack_canary;
-#endif
-}
-
-static inline void setup_stack_canary_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
- struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
- struct desc_struct desc;
-
- desc = gdt_table[GDT_ENTRY_STACK_CANARY];
- set_desc_base(&desc, canary);
- write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
-#endif
-}
-
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
+ per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
#endif
}
#else /* STACKPROTECTOR */
-#define GDT_STACK_CANARY_INIT
-
/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
-static inline void setup_stack_canary_segment(int cpu)
-{ }
-
static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
{ }
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm volatile ("mov %0, %%gs" : : "r" (0));
-#endif
-}
-
#endif /* STACKPROTECTOR */
#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..3881b5333eb8 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+static __always_inline
+bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
+{
+ /* make sure it's not in the stack proper */
+ if (get_stack_info_noinstr(stack, current, info))
+ return false;
+ /* but if it is in the page below it, we hit a guard */
+ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info);
+}
+
const char *stack_type_name(enum stack_type type);
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index cbb67b6030f9..2d8dacd02643 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -26,7 +26,9 @@
".align 4 \n" \
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
+ ANNOTATE_NOENDBR \
insns " \n" \
+ ".byte 0x53, 0x43, 0x54 \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
".popsection \n")
@@ -35,8 +37,10 @@
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
- __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
#define ARCH_ADD_TRAMP_KEY(name) \
asm(".pushsection .static_call_tramp_key, \"a\" \n" \
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index f74362b05619..32c0d981a82a 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -146,42 +146,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
extern void *memcpy(void *, const void *, size_t);
#ifndef CONFIG_FORTIFY_SOURCE
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-/*
- * This CPU favours 3DNow strongly (eg AMD Athlon)
- */
-
-static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __constant_memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy3d((t), (f), (n)) \
- : __memcpy3d((t), (f), (n)))
-
-#else
-
-/*
- * No 3D Now!
- */
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMMOVE
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fdbd9d7b7bca..a800abb1a992 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -13,15 +13,12 @@
/* image of the saved processor state */
struct saved_context {
/*
- * On x86_32, all segment registers, with the possible exception of
- * gs, are saved at kernel entry in pt_regs.
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
*/
-#ifdef CONFIG_X86_32_LAZY_GS
u16 gs;
-#endif
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
- bool misc_enable_saved;
struct saved_msrs saved_msrs;
struct desc_ptr gdt_desc;
struct desc_ptr idt;
@@ -30,6 +27,7 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ bool misc_enable_saved;
} __attribute__((packed));
/* routines for saving/restoring kernel state */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 35bb35d28733..54df06687d83 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -14,9 +14,13 @@
* Image of the saved processor state, used by the low level ACPI suspend to
* RAM code and by the low level hibernation code.
*
- * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
- * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
- * still work as required.
+ * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S
+ * and make sure that __save/__restore_processor_state(), defined in
+ * arch/x86/power/cpu.c, still work as required.
+ *
+ * Because the structure is packed, make sure to avoid unaligned members. For
+ * optimisation purposes but also because tools like kmemleak only search for
+ * pointers that are aligned.
*/
struct saved_context {
struct pt_regs regs;
@@ -36,7 +40,6 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
- bool misc_enable_saved;
struct saved_msrs saved_msrs;
unsigned long efer;
u16 gdt_pad; /* Unused */
@@ -48,6 +51,7 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ bool misc_enable_saved;
} __attribute__((packed));
#define loaddebug(thread,register) \
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1c561945b426..1b07fba11704 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -156,6 +156,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
+ u8 reserved_8[720];
+ /*
+ * Offset 0x3e0, 32 bytes reserved
+ * for use by hypervisor/software.
+ */
+ u8 reserved_sw[32];
};
@@ -178,6 +184,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_IGN_TPR_SHIFT 20
#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK)
+
#define V_INTR_MASKING_SHIFT 24
#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
@@ -212,6 +220,50 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
+
+#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define SVM_TSC_RATIO_MIN 0x0000000000000001ULL
+#define SVM_TSC_RATIO_MAX 0x000000ffffffffffULL
+#define SVM_TSC_RATIO_DEFAULT 0x0100000000ULL
+
+
+/* AVIC */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
+
+#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff
+
+#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+
struct vmcb_seg {
u16 selector;
u16 attrib;
@@ -219,6 +271,7 @@ struct vmcb_seg {
u64 base;
} __packed;
+/* Save area definition for legacy and SEV-MEM guests */
struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
@@ -230,12 +283,12 @@ struct vmcb_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u8 reserved_1[43];
+ u8 reserved_1[42];
+ u8 vmpl;
u8 cpl;
u8 reserved_2[4];
u64 efer;
- u8 reserved_3[104];
- u64 xss; /* Valid for SEV-ES only */
+ u8 reserved_3[112];
u64 cr4;
u64 cr3;
u64 cr0;
@@ -245,7 +298,9 @@ struct vmcb_save_area {
u64 rip;
u8 reserved_4[88];
u64 rsp;
- u8 reserved_5[24];
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
u64 rax;
u64 star;
u64 lstar;
@@ -256,27 +311,145 @@ struct vmcb_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_6[32];
+ u8 reserved_5[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
+ u8 reserved_6[72];
+ u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+} __packed;
- /*
- * The following part of the save area is valid only for
- * SEV-ES guests when referenced through the GHCB or for
- * saving to the host save area.
- */
+/* Save area definition for SEV-ES and SEV-SNP guests */
+struct sev_es_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u64 vmpl0_ssp;
+ u64 vmpl1_ssp;
+ u64 vmpl2_ssp;
+ u64 vmpl3_ssp;
+ u64 u_cet;
+ u8 reserved_1[2];
+ u8 vmpl;
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[104];
+ u64 xss;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 dr0;
+ u64 dr1;
+ u64 dr2;
+ u64 dr3;
+ u64 dr0_addr_mask;
+ u64 dr1_addr_mask;
+ u64 dr2_addr_mask;
+ u64 dr3_addr_mask;
+ u8 reserved_4[24];
+ u64 rsp;
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_5[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
u8 reserved_7[80];
u32 pkru;
- u8 reserved_7a[20];
- u64 reserved_8; /* rax already available at 0x01f8 */
+ u8 reserved_8[20];
+ u64 reserved_9; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_10; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_11[16];
+ u64 guest_exit_info_1;
+ u64 guest_exit_info_2;
+ u64 guest_exit_int_info;
+ u64 guest_nrip;
+ u64 sev_features;
+ u64 vintr_ctrl;
+ u64 guest_exit_code;
+ u64 virtual_tom;
+ u64 tlb_id;
+ u64 pcpu_id;
+ u64 event_inj;
+ u64 xcr0;
+ u8 reserved_12[16];
+
+ /* Floating point area */
+ u64 x87_dp;
+ u32 mxcsr;
+ u16 x87_ftw;
+ u16 x87_fsw;
+ u16 x87_fcw;
+ u16 x87_fop;
+ u16 x87_ds;
+ u16 x87_cs;
+ u64 x87_rip;
+ u8 fpreg_x87[80];
+ u8 fpreg_xmm[256];
+ u8 fpreg_ymm[256];
+} __packed;
+
+struct ghcb_save_area {
+ u8 reserved_1[203];
+ u8 cpl;
+ u8 reserved_2[116];
+ u64 xss;
+ u8 reserved_3[24];
+ u64 dr7;
+ u8 reserved_4[16];
+ u64 rip;
+ u8 reserved_5[88];
+ u64 rsp;
+ u8 reserved_6[24];
+ u64 rax;
+ u8 reserved_7[264];
u64 rcx;
u64 rdx;
u64 rbx;
- u64 reserved_9; /* rsp already available at 0x01d8 */
+ u8 reserved_8[8];
u64 rbp;
u64 rsi;
u64 rdi;
@@ -288,22 +461,24 @@ struct vmcb_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_10[16];
+ u8 reserved_9[16];
u64 sw_exit_code;
u64 sw_exit_info_1;
u64 sw_exit_info_2;
u64 sw_scratch;
- u8 reserved_11[56];
+ u8 reserved_10[56];
u64 xcr0;
u8 valid_bitmap[16];
u64 x87_state_gpa;
} __packed;
+#define GHCB_SHARED_BUF_SIZE 2032
+
struct ghcb {
- struct vmcb_save_area save;
- u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+ struct ghcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct ghcb_save_area)];
- u8 shared_buffer[2032];
+ u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
u8 reserved_1[10];
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
@@ -311,20 +486,23 @@ struct ghcb {
} __packed;
-#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
-#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 740
+#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
{
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
}
struct vmcb {
struct vmcb_control_area control;
- u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
} __packed;
@@ -388,26 +566,26 @@ struct vmcb {
/* GHCB Accessor functions */
#define GHCB_BITMAP_IDX(field) \
- (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+ (offsetof(struct ghcb_save_area, field) / sizeof(u64))
#define DEFINE_GHCB_ACCESSORS(field) \
- static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
{ \
return test_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
} \
\
- static inline u64 ghcb_get_##field(struct ghcb *ghcb) \
+ static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \
{ \
return ghcb->save.field; \
} \
\
- static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
+ static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
{ \
return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \
} \
\
- static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
{ \
__set_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
deleted file mode 100644
index ff6c92eff035..000000000000
--- a/arch/x86/include/asm/swiotlb.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_SWIOTLB_H
-#define _ASM_X86_SWIOTLB_H
-
-#include <linux/swiotlb.h>
-
-#ifdef CONFIG_SWIOTLB
-extern int swiotlb;
-extern int __init pci_swiotlb_detect_override(void);
-extern int __init pci_swiotlb_detect_4gb(void);
-extern void __init pci_swiotlb_init(void);
-extern void __init pci_swiotlb_late_init(void);
-#else
-#define swiotlb 0
-static inline int pci_swiotlb_detect_override(void)
-{
- return 0;
-}
-static inline int pci_swiotlb_detect_4gb(void)
-{
- return 0;
-}
-static inline void pci_swiotlb_init(void)
-{
-}
-static inline void pci_swiotlb_late_init(void)
-{
-}
-#endif
-#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..c08eb0fdd11f 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,25 +71,20 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- /*
- * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
- * doesn't work on x86-32 because sp1 and
- * cpu_current_top_of_stack have different values (because of
- * the non-zero stack-padding on 32bit).
- */
+ /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}
static inline void kthread_frame_init(struct inactive_task_frame *frame,
- unsigned long fun, unsigned long arg)
+ int (*fun)(void *), void *arg)
{
- frame->bx = fun;
+ frame->bx = (unsigned long)fun;
#ifdef CONFIG_X86_32
- frame->di = arg;
+ frame->di = (unsigned long)arg;
#else
- frame->r12 = arg;
+ frame->r12 = (unsigned long)arg;
#endif
}
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7cbf733d11af..5b85987a5e97 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -21,13 +21,12 @@ extern const sys_call_ptr_t sys_call_table[];
#if defined(CONFIG_X86_32)
#define ia32_sys_call_table sys_call_table
-#endif
-
-#if defined(CONFIG_IA32_EMULATION)
+#else
+/*
+ * These may not exist, but still put the prototypes in so we
+ * can use IS_ENABLED().
+ */
extern const sys_call_ptr_t ia32_sys_call_table[];
-#endif
-
-#ifdef CONFIG_X86_X32_ABI
extern const sys_call_ptr_t x32_sys_call_table[];
#endif
@@ -88,15 +87,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(args, &regs->bx, 6 * sizeof(args[0]));
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- unsigned int i, unsigned int n,
- const unsigned long *args)
-{
- BUG_ON(i + n > 6);
- memcpy(&regs->bx + i, args, n * sizeof(args[0]));
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -128,30 +118,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- const unsigned long *args)
-{
-# ifdef CONFIG_IA32_EMULATION
- if (task->thread_info.status & TS_COMPAT) {
- regs->bx = *args++;
- regs->cx = *args++;
- regs->dx = *args++;
- regs->si = *args++;
- regs->di = *args++;
- regs->bp = *args;
- } else
-# endif
- {
- regs->di = *args++;
- regs->si = *args++;
- regs->dx = *args++;
- regs->r10 = *args++;
- regs->r8 = *args++;
- regs->r9 = *args;
- }
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
@@ -160,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
-void do_syscall_64(unsigned long nr, struct pt_regs *regs);
+void do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index a84333adeef2..59358d1bf880 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -17,7 +17,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* __x64_sys_*() - 64-bit native syscall
* __ia32_sys_*() - 32-bit native syscall or common compat syscall
* __ia32_compat_sys_*() - 32-bit compat syscall
- * __x32_compat_sys_*() - 64-bit X32 compat syscall
+ * __x64_compat_sys_*() - 64-bit X32 compat syscall
*
* The registers are decoded according to the ABI:
* 64-bit: RDI, RSI, RDX, R10, R8, R9
@@ -80,6 +80,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
}
#define __COND_SYSCALL(abi, name) \
+ __weak long __##abi##_##name(const struct pt_regs *__unused); \
__weak long __##abi##_##name(const struct pt_regs *__unused) \
{ \
return sys_ni_syscall(); \
@@ -158,30 +159,30 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#endif /* CONFIG_IA32_EMULATION */
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
/*
* For the x32 ABI, we need to create a stub for compat_sys_*() which is aware
* of the x86-64-style parameter ordering of x32 syscalls. The syscalls common
* with x86_64 obviously do not need such care.
*/
#define __X32_COMPAT_SYS_STUB0(name) \
- __SYS_STUB0(x32, compat_sys_##name)
+ __SYS_STUB0(x64, compat_sys_##name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...) \
- __SYS_STUBx(x32, compat_sys##name, \
+ __SYS_STUBx(x64, compat_sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
#define __X32_COMPAT_COND_SYSCALL(name) \
- __COND_SYSCALL(x32, compat_sys_##name)
+ __COND_SYSCALL(x64, compat_sys_##name)
#define __X32_COMPAT_SYS_NI(name) \
- __SYS_NI(x32, compat_sys_##name)
-#else /* CONFIG_X86_X32 */
+ __SYS_NI(x64, compat_sys_##name)
+#else /* CONFIG_X86_X32_ABI */
#define __X32_COMPAT_SYS_STUB0(name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...)
#define __X32_COMPAT_COND_SYSCALL(name)
#define __X32_COMPAT_SYS_NI(name)
-#endif /* CONFIG_X86_X32 */
+#endif /* CONFIG_X86_X32_ABI */
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h
deleted file mode 100644
index 9834eef7f034..000000000000
--- a/arch/x86/include/asm/sysfb.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ARCH_X86_KERNEL_SYSFB_H
-#define _ARCH_X86_KERNEL_SYSFB_H
-
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/screen_info.h>
-
-enum {
- M_I17, /* 17-Inch iMac */
- M_I20, /* 20-Inch iMac */
- M_I20_SR, /* 20-Inch iMac (Santa Rosa) */
- M_I24, /* 24-Inch iMac */
- M_I24_8_1, /* 24-Inch iMac, 8,1th gen */
- M_I24_10_1, /* 24-Inch iMac, 10,1th gen */
- M_I27_11_1, /* 27-Inch iMac, 11,1th gen */
- M_MINI, /* Mac Mini */
- M_MINI_3_1, /* Mac Mini, 3,1th gen */
- M_MINI_4_1, /* Mac Mini, 4,1th gen */
- M_MB, /* MacBook */
- M_MB_2, /* MacBook, 2nd rev. */
- M_MB_3, /* MacBook, 3rd rev. */
- M_MB_5_1, /* MacBook, 5th rev. */
- M_MB_6_1, /* MacBook, 6th rev. */
- M_MB_7_1, /* MacBook, 7th rev. */
- M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */
- M_MBA, /* MacBook Air */
- M_MBA_3, /* Macbook Air, 3rd rev */
- M_MBP, /* MacBook Pro */
- M_MBP_2, /* MacBook Pro 2nd gen */
- M_MBP_2_2, /* MacBook Pro 2,2nd gen */
- M_MBP_SR, /* MacBook Pro (Santa Rosa) */
- M_MBP_4, /* MacBook Pro, 4th gen */
- M_MBP_5_1, /* MacBook Pro, 5,1th gen */
- M_MBP_5_2, /* MacBook Pro, 5,2th gen */
- M_MBP_5_3, /* MacBook Pro, 5,3rd gen */
- M_MBP_6_1, /* MacBook Pro, 6,1th gen */
- M_MBP_6_2, /* MacBook Pro, 6,2th gen */
- M_MBP_7_1, /* MacBook Pro, 7,1th gen */
- M_MBP_8_2, /* MacBook Pro, 8,2nd gen */
- M_UNKNOWN /* placeholder */
-};
-
-struct efifb_dmi_info {
- char *optname;
- unsigned long base;
- int stride;
- int width;
- int height;
- int flags;
-};
-
-#ifdef CONFIG_EFI
-
-extern struct efifb_dmi_info efifb_dmi_list[];
-void sysfb_apply_efi_quirks(void);
-
-#else /* CONFIG_EFI */
-
-static inline void sysfb_apply_efi_quirks(void)
-{
-}
-
-#endif /* CONFIG_EFI */
-
-#ifdef CONFIG_X86_SYSFB
-
-bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode);
-int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode);
-
-#else /* CONFIG_X86_SYSFB */
-
-static inline bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode)
-{
- return false;
-}
-
-static inline int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode)
-{
- return -EINVAL;
-}
-
-#endif /* CONFIG_X86_SYSFB */
-
-#endif /* _ARCH_X86_KERNEL_SYSFB_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
new file mode 100644
index 000000000000..020c81a7c729
--- /dev/null
+++ b/arch/x86/include/asm/tdx.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021-2022 Intel Corporation */
+#ifndef _ASM_X86_TDX_H
+#define _ASM_X86_TDX_H
+
+#include <linux/init.h>
+#include <linux/bits.h>
+#include <asm/ptrace.h>
+#include <asm/shared/tdx.h>
+
+/*
+ * SW-defined error codes.
+ *
+ * Bits 47:40 == 0xFF indicate Reserved status code class that never used by
+ * TDX module.
+ */
+#define TDX_ERROR _BITUL(63)
+#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
+#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Used to gather the output registers values of the TDCALL and SEAMCALL
+ * instructions when requesting services from the TDX module.
+ *
+ * This is a software only structure and not part of the TDX module/VMM ABI.
+ */
+struct tdx_module_output {
+ u64 rcx;
+ u64 rdx;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+};
+
+/*
+ * Used by the #VE exception handler to gather the #VE exception
+ * info from the TDX module. This is a software only structure
+ * and not part of the TDX module/VMM ABI.
+ */
+struct ve_info {
+ u64 exit_reason;
+ u64 exit_qual;
+ /* Guest Linear (virtual) Address */
+ u64 gla;
+ /* Guest Physical Address */
+ u64 gpa;
+ u32 instr_len;
+ u32 instr_info;
+};
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+void __init tdx_early_init(void);
+
+/* Used to communicate with the TDX module */
+u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+ struct tdx_module_output *out);
+
+void tdx_get_ve_info(struct ve_info *ve);
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
+void tdx_safe_halt(void);
+
+bool tdx_early_handle_ve(struct pt_regs *regs);
+
+#else
+
+static inline void tdx_early_init(void) { };
+static inline void tdx_safe_halt(void) { };
+
+static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
+
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST)
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4);
+#else
+static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index b7421780e4e9..1cc15528ce29 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -44,6 +44,8 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
@@ -96,24 +98,40 @@ union text_poke_insn {
};
static __always_inline
-void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size)
{
- static union text_poke_insn insn; /* per instance */
- int size = text_opcode_size(opcode);
+ union text_poke_insn *insn = buf;
+
+ BUG_ON(size < text_opcode_size(opcode));
+
+ /*
+ * Hide the addresses to avoid the compiler folding in constants when
+ * referencing code, these can mess up annotations like
+ * ANNOTATE_NOENDBR.
+ */
+ OPTIMIZER_HIDE_VAR(insn);
+ OPTIMIZER_HIDE_VAR(addr);
+ OPTIMIZER_HIDE_VAR(dest);
- insn.opcode = opcode;
+ insn->opcode = opcode;
if (size > 1) {
- insn.disp = (long)dest - (long)(addr + size);
+ insn->disp = (long)dest - (long)(addr + size);
if (size == 2) {
/*
- * Ensure that for JMP9 the displacement
+ * Ensure that for JMP8 the displacement
* actually fits the signed byte.
*/
- BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
+ BUG_ON((insn->disp >> 31) != (insn->disp >> 7));
}
}
+}
+static __always_inline
+void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+{
+ static union text_poke_insn insn; /* per instance */
+ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode));
return &insn.text;
}
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
index ddbdefd5b94f..91a7b6687c3b 100644
--- a/arch/x86/include/asm/thermal.h
+++ b/arch/x86/include/asm/thermal.h
@@ -3,11 +3,13 @@
#define _ASM_X86_THERMAL_H
#ifdef CONFIG_X86_THERMAL_VECTOR
+void therm_lvt_init(void);
void intel_init_thermal(struct cpuinfo_x86 *c);
bool x86_thermal_enabled(void);
void intel_thermal_interrupt(void);
#else
-static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+static inline void therm_lvt_init(void) { }
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
#endif
#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 06b740bae431..f0cb881c1d69 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,6 +57,9 @@ struct thread_info {
unsigned long flags; /* low level flags */
unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
+#ifdef CONFIG_SMP
+ u32 cpu; /* current CPU */
+#endif
};
#define INIT_THREAD_INFO(tsk) \
@@ -81,7 +84,7 @@ struct thread_info {
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
+#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -89,10 +92,10 @@ struct thread_info {
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
-#define TIF_SLD 18 /* Restore split lock detection on context switch */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -104,7 +107,7 @@ struct thread_info {
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_SSBD (1 << TIF_SSBD)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
-#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
+#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
@@ -112,9 +115,9 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_SLD (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
+#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
@@ -123,7 +126,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
(_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
- _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD)
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
/*
* Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
@@ -197,13 +200,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
-#endif
+#endif /* !__ASSEMBLY__ */
/*
* Thread-synchronous status.
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index a4a8b1b16c0c..956e4145311b 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -5,6 +5,15 @@
#include <asm/processor.h>
#include <asm/tsc.h>
+static inline unsigned long random_get_entropy(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
+ return random_get_entropy_fallback();
+ return rdtsc();
+}
+#define random_get_entropy random_get_entropy
+
/* Assume we use the PIT time source for the clock tick */
#define CLOCK_TICK_RATE PIT_TICK_RATE
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e0b660..4af5579c7ef7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,30 +83,13 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};
u16 loaded_mm_asid;
u16 next_asid;
/*
- * We can be in one of several states:
- *
- * - Actively using an mm. Our CPU's bit will be set in
- * mm_cpumask(loaded_mm) and is_lazy == false;
- *
- * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
- * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
- *
- * - Lazily using a real mm. loaded_mm != &init_mm, our bit
- * is set in mm_cpumask(loaded_mm), but is_lazy == true.
- * We're heuristically guessing that the CR3 load we
- * skipped more than makes up for the overhead added by
- * lazy mode.
- */
- bool is_lazy;
-
- /*
* If set we changed the page tables in such a way that we
* needed an invalidation of all contexts (aka. PCIDs / ASIDs).
* This tells us to go invalidate all the non-loaded ctxs[]
@@ -151,7 +134,27 @@ struct tlb_state {
*/
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+struct tlb_state_shared {
+ /*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay
@@ -175,7 +178,7 @@ extern void initialize_tlbstate_and_flush(void);
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
+ * - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
@@ -201,14 +204,15 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
- unsigned int stride_shift;
- bool freed_tables;
+ unsigned int initiating_cpu;
+ u8 stride_shift;
+ u8 freed_tables;
};
void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
#ifdef CONFIG_PARAVIRT
@@ -255,6 +259,108 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+static inline bool pte_flags_need_flush(unsigned long oldflags,
+ unsigned long newflags,
+ bool ignore_access)
+{
+ /*
+ * Flags that require a flush when cleared but not when they are set.
+ * Only include flags that would not trigger spurious page-faults.
+ * Non-present entries are not cached. Hardware would set the
+ * dirty/access bit if needed without a fault.
+ */
+ const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
+ _PAGE_ACCESSED;
+ const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
+ _PAGE_SOFTW3 | _PAGE_SOFTW4;
+ const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
+ _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
+ _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
+ _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
+ unsigned long diff = oldflags ^ newflags;
+
+ BUILD_BUG_ON(flush_on_clear & software_flags);
+ BUILD_BUG_ON(flush_on_clear & flush_on_change);
+ BUILD_BUG_ON(flush_on_change & software_flags);
+
+ /* Ignore software flags */
+ diff &= ~software_flags;
+
+ if (ignore_access)
+ diff &= ~_PAGE_ACCESSED;
+
+ /*
+ * Did any of the 'flush_on_clear' flags was clleared set from between
+ * 'oldflags' and 'newflags'?
+ */
+ if (diff & oldflags & flush_on_clear)
+ return true;
+
+ /* Flush on modified flags. */
+ if (diff & flush_on_change)
+ return true;
+
+ /* Ensure there are no flags that were left behind */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
+ VM_WARN_ON_ONCE(1);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * pte_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace PTEs.
+ */
+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
+{
+ /* !PRESENT -> * ; no need for flush */
+ if (!(pte_flags(oldpte) & _PAGE_PRESENT))
+ return false;
+
+ /* PFN changed ; needs flush */
+ if (pte_pfn(oldpte) != pte_pfn(newpte))
+ return true;
+
+ /*
+ * check PTE flags; ignore access-bit; see comment in
+ * ptep_clear_flush_young().
+ */
+ return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
+ true);
+}
+#define pte_needs_flush pte_needs_flush
+
+/*
+ * huge_pmd_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace huge PMDs.
+ */
+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
+{
+ /* !PRESENT -> * ; no need for flush */
+ if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
+ return false;
+
+ /* PFN changed ; needs flush */
+ if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
+ return true;
+
+ /*
+ * check PMD flags; do not ignore access-bit; see
+ * pmdp_clear_flush_young().
+ */
+ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
+ false);
+}
+#define huge_pmd_needs_flush huge_pmd_needs_flush
+
#endif /* !MODULE */
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ native_write_cr4(cr4);
+}
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9239399e5491..458c891a8273 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -103,17 +103,21 @@ static inline void setup_node_to_cpumask_map(void) { }
#include <asm-generic/topology.h>
extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id)
#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+#define topology_ppin(cpu) (cpu_data(cpu).ppin)
extern unsigned int __max_die_per_package;
#ifdef CONFIG_SMP
+#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu))
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
@@ -208,19 +212,19 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
-extern void arch_scale_freq_tick(void);
-#define arch_scale_freq_tick arch_scale_freq_tick
-
extern void arch_set_max_freq_ratio(bool turbo_disabled);
+extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
-static inline void arch_set_max_freq_ratio(bool turbo_disabled)
-{
-}
+static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
+static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
+extern void arch_scale_freq_tick(void);
+#define arch_scale_freq_tick arch_scale_freq_tick
+
#ifdef CONFIG_ACPI_CPPC_LIB
void init_freq_invariance_cppc(void);
-#define init_freq_invariance_cppc init_freq_invariance_cppc
+#define arch_init_invariance_cppc init_freq_invariance_cppc
#endif
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 879b77792f94..4645a6334063 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu = fpu;
__entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD);
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
- __entry->xfeatures = fpu->state.xsave.header.xfeatures;
- __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
+ __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
+ __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv;
}
),
TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
index 4d705cb4d63b..a8e5a7a2b460 100644
--- a/arch/x86/include/asm/trace/hyperv.h
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -8,7 +8,7 @@
#if IS_ENABLED(CONFIG_HYPERV)
-TRACE_EVENT(hyperv_mmu_flush_tlb_others,
+TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
TP_PROTO(const struct cpumask *cpus,
const struct flush_tlb_info *info),
TP_ARGS(cpus, info),
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7f7200021bd1..47ecfff2c83d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,11 +13,13 @@
#ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
-struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
+struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
+extern bool ibt_selftest(void);
+
#ifdef CONFIG_X86_F00F_BUG
/* For handling the FOOF bug */
void handle_invalid_op(struct pt_regs *regs);
@@ -40,9 +42,9 @@ void math_emulate(struct math_emu_info *);
bool fault_in_kernel_space(unsigned long address);
#ifdef CONFIG_VMAP_STACK
-void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address);
+void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info);
#endif
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 01a300a9700b..fbdc3d951494 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -20,13 +20,12 @@ extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
-#ifndef CONFIG_X86_TSC
- if (!boot_cpu_has(X86_FEATURE_TSC))
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
return 0;
-#endif
-
return rdtsc();
}
+#define get_cycles get_cycles
extern struct system_counterval_t convert_art_to_tsc(u64 art);
extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c9fa7be3df82..913e593a3b45 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -12,35 +12,6 @@
#include <asm/smap.h>
#include <asm/extable.h>
-/*
- * Test whether a block of memory is a valid user space address.
- * Returns 0 if the range is valid, nonzero otherwise.
- */
-static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
-{
- /*
- * If we have used "sizeof()" for the size,
- * we know it won't overflow the limit (but
- * it might overflow the 'addr', so it's
- * important to subtract the size from the
- * limit, not add it to the address).
- */
- if (__builtin_constant_p(size))
- return unlikely(addr > limit - size);
-
- /* Arbitrary sizes? Be careful about overflow */
- addr += size;
- if (unlikely(addr < size))
- return true;
- return unlikely(addr > limit);
-}
-
-#define __range_not_ok(addr, size, limit) \
-({ \
- __chk_user_ptr(addr); \
- __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
-})
-
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ() \
@@ -69,9 +40,11 @@ static inline bool pagefault_disabled(void);
#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
- likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \
+ likely(__access_ok(addr, size)); \
})
+#include <asm-generic/access_ok.h>
+
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
@@ -301,8 +274,8 @@ do { \
unsigned int __gu_low, __gu_high; \
const unsigned int __user *__gu_ptr; \
__gu_ptr = (const void __user *)(ptr); \
- __get_user_asm(__gu_low, ptr, "l", "=r", label); \
- __get_user_asm(__gu_high, ptr+1, "l", "=r", label); \
+ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \
+ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \
(x) = ((unsigned long long)__gu_high << 32) | __gu_low; \
} while (0)
#else
@@ -314,11 +287,12 @@ do { \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
- unsigned char x_u8__; \
- case 1: \
+ case 1: { \
+ unsigned char x_u8__; \
__get_user_asm(x_u8__, ptr, "b", "=q", label); \
(x) = x_u8__; \
break; \
+ } \
case 2: \
__get_user_asm(x, ptr, "w", "=r", label); \
break; \
@@ -351,24 +325,22 @@ do { \
"1: movl %[lowbits],%%eax\n" \
"2: movl %[highbits],%%edx\n" \
"3:\n" \
- ".section .fixup,\"ax\"\n" \
- "4: mov %[efault],%[errout]\n" \
- " xorl %%eax,%%eax\n" \
- " xorl %%edx,%%edx\n" \
- " jmp 3b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 4b) \
- _ASM_EXTABLE_UA(2b, 4b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
+ _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
: [errout] "=r" (retval), \
[output] "=&A"(x) \
: [lowbits] "m" (__m(__ptr)), \
[highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \
- [efault] "i" (-EFAULT), "0" (retval)); \
+ "0" (retval)); \
})
#else
#define __get_user_asm_u64(x, ptr, retval) \
- __get_user_asm(x, ptr, retval, "q", "=r")
+ __get_user_asm(x, ptr, retval, "q")
#endif
#define __get_user_size(x, ptr, size, retval) \
@@ -379,14 +351,14 @@ do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
- __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \
+ __get_user_asm(x_u8__, ptr, retval, "b"); \
(x) = x_u8__; \
break; \
case 2: \
- __get_user_asm(x, ptr, retval, "w", "=r"); \
+ __get_user_asm(x, ptr, retval, "w"); \
break; \
case 4: \
- __get_user_asm(x, ptr, retval, "l", "=r"); \
+ __get_user_asm(x, ptr, retval, "l"); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval); \
@@ -396,22 +368,116 @@ do { \
} \
} while (0)
-#define __get_user_asm(x, addr, err, itype, ltype) \
+#define __get_user_asm(x, addr, err, itype) \
asm volatile("\n" \
"1: mov"itype" %[umem],%[output]\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: mov %[efault],%[errout]\n" \
- " xorl %k[output],%k[output]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX, \
+ %[errout]) \
: [errout] "=r" (err), \
- [output] ltype(x) \
+ [output] "=a" (x) \
: [umem] "m" (__m(addr)), \
- [efault] "i" (-EFAULT), "0" (err))
+ "0" (err))
+
+#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#endif // CONFIG_CC_ASM_GOTO_OUTPUT
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+#endif // CONFIG_X86_32
+#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ int __err = 0; \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ CC_SET(z) \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[errout]) \
+ : CC_OUT(z) (success), \
+ [errout] "+r" (__err), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory"); \
+ if (unlikely(__err)) \
+ goto label; \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+/*
+ * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
+ * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses
+ * both ESI and EDI for the memory operand, compilation will fail if the error
+ * is an input+output as there will be no register available for input.
+ */
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ int __result; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ "mov $0, %%ecx\n\t" \
+ "setz %%cl\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
+ : [result]"=c" (__result), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory", "cc"); \
+ if (unlikely(__result < 0)) \
+ goto label; \
+ if (unlikely(!__result)) \
+ *_old = __old; \
+ likely(__result); })
+#endif // CONFIG_X86_32
+#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
@@ -505,6 +571,51 @@ do { \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+extern void __try_cmpxchg_user_wrong_size(void);
+
+#ifndef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \
+ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+#endif
+
+/*
+ * Force the pointer to u<size> to match the size expected by the asm helper.
+ * clang/LLVM compiles all cases and only discards the unused paths after
+ * processing errors, which breaks i386 if the pointer is an 8-byte value.
+ */
+#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ bool __ret; \
+ __chk_user_ptr(_ptr); \
+ switch (sizeof(*(_ptr))) { \
+ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \
+ (__force u8 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \
+ (__force u16 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \
+ (__force u32 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
+ (_nval), _label); \
+ break; \
+ default: __try_cmpxchg_user_wrong_size(); \
+ } \
+ __ret; })
+
+/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
+#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ int __ret = -EFAULT; \
+ __uaccess_begin_nospec(); \
+ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \
+_label: \
+ __uaccess_end(); \
+ __ret; \
+ })
+
/*
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
@@ -528,8 +639,6 @@ do { \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
} while (0)
-#define HAVE_GET_KERNEL_NOFAULT
-
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label) \
__get_user_size(*((type *)(dst)), (__force type __user *)(src), \
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e7265a552f4f..45697e04d771 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size);
}
-static __always_inline __must_check
-unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
-{
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
-}
-
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
deleted file mode 100644
index 9c754a7447aa..000000000000
--- a/arch/x86/include/asm/unaligned.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_UNALIGNED_H
-#define _ASM_X86_UNALIGNED_H
-
-/*
- * The x86 can do unaligned accesses itself.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned __get_unaligned_le
-#define put_unaligned __put_unaligned_le
-
-#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index c1c3d31b15c0..761173ccc33c 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -13,7 +13,7 @@
# define __ARCH_WANT_SYS_OLD_MMAP
# define __ARCH_WANT_SYS_OLD_SELECT
-# define __NR_ia32_syscall_max __NR_syscall_max
+# define IA32_NR_syscalls (__NR_syscalls)
# else
@@ -22,16 +22,17 @@
# include <asm/unistd_32_ia32.h>
# define __ARCH_WANT_SYS_TIME
# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_COMPAT_STAT
# define __ARCH_WANT_COMPAT_SYS_PREADV64
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+# define X32_NR_syscalls (__NR_x32_syscalls)
+# define IA32_NR_syscalls (__NR_ia32_syscalls)
# endif
-# define NR_syscalls (__NR_syscall_max + 1)
-# define X32_NR_syscalls (__NR_x32_syscall_max + 1)
-# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1)
+# define NR_syscalls (__NR_syscalls)
# define __ARCH_WANT_NEW_STAT
# define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 70fc159ebe69..7cede4dc21f0 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/ftrace.h>
+#include <linux/rethook.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
@@ -15,6 +16,9 @@ struct unwind_state {
unsigned long stack_mask;
struct task_struct *task;
int graph_idx;
+#if defined(CONFIG_RETHOOK)
+ struct llist_node *kr_cur;
+#endif
bool error;
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
@@ -99,6 +103,30 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size) {}
#endif
+static inline
+unsigned long unwind_recover_rethook(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+#ifdef CONFIG_RETHOOK
+ if (is_rethook_trampoline(addr))
+ return rethook_find_ret_addr(state->task, (unsigned long)addr_p,
+ &state->kr_cur);
+#endif
+ return addr;
+}
+
+/* Recover the return address modified by rethook and ftrace_graph. */
+static inline
+unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+ unsigned long ret;
+
+ ret = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ return unwind_recover_rethook(state, ret, addr_p);
+}
+
/*
* This disables KASAN checking when reading a value from another task's stack,
* since the other task could be running on another CPU and could have poisoned
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 8e574c0afef8..8b33674288ea 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -52,6 +52,11 @@
UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
.endm
+#else
+
+#define UNWIND_HINT_FUNC \
+ UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
index d72c3d66e94f..8963915e533f 100644
--- a/arch/x86/include/asm/user_32.h
+++ b/arch/x86/include/asm/user_32.h
@@ -124,9 +124,5 @@ struct user{
char u_comm[32]; /* User command that was responsible */
int u_debugreg[8];
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _ASM_X86_USER_32_H */
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
index db909923611c..1dd10f07ccd6 100644
--- a/arch/x86/include/asm/user_64.h
+++ b/arch/x86/include/asm/user_64.h
@@ -130,9 +130,5 @@ struct user {
unsigned long error_code; /* CPU error code or 0 */
unsigned long fault_address; /* CR3 or 0 */
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _ASM_X86_USER_64_H */
diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h
index f241451035fb..027a9258dbca 100644
--- a/arch/x86/include/asm/uv/uv_geo.h
+++ b/arch/x86/include/asm/uv/uv_geo.h
@@ -10,7 +10,7 @@
#ifndef _ASM_UV_GEO_H
#define _ASM_UV_GEO_H
-/* Type declaractions */
+/* Type declarations */
/* Size of a geoid_s structure (must be before decl. of geoid_u) */
#define GEOID_SIZE 8
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 5002f52be332..d3e3197917be 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -353,7 +353,7 @@ union uvh_apicid {
*
* Note there are NO leds on a UV system. This register is only
* used by the system controller to monitor system-wide operation.
- * There are 64 regs per node. With Nahelem cpus (2 cores per node,
+ * There are 64 regs per node. With Nehalem cpus (2 cores per node,
* 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
* a node.
*
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 98aa103eb4ab..2963a2f5dbc4 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -37,7 +37,7 @@ struct vdso_image {
extern const struct vdso_image vdso_image_64;
#endif
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
extern const struct vdso_image vdso_image_x32;
#endif
diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h
index 119ac8612d89..136e5e57cfe1 100644
--- a/arch/x86/include/asm/vdso/clocksource.h
+++ b/arch/x86/include/asm/vdso/clocksource.h
@@ -7,4 +7,6 @@
VDSO_CLOCKMODE_PVCLOCK, \
VDSO_CLOCKMODE_HVCLOCK
+#define HAVE_VDSO_CLOCKMODE_HVCLOCK
+
#endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index df01d7349d79..1936f21ed8cd 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -58,7 +58,8 @@ extern struct ms_hyperv_tsc_page hvclock_page
#endif
#ifdef CONFIG_TIME_NS
-static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void)
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
{
return __timens_vdso_data;
}
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 29837740b520..49ce331f3ac6 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,6 +1,26 @@
#ifndef _ASM_X86_VMALLOC_H
#define _ASM_X86_VMALLOC_H
+#include <asm/cpufeature.h>
+#include <asm/page.h>
#include <asm/pgtable_areas.h>
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+
+#ifdef CONFIG_X86_64
+#define arch_vmap_pud_supported arch_vmap_pud_supported
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_GBPAGES);
+}
+#endif
+
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+#endif
+
#endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 358707f60d99..6c343c6a1855 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -373,6 +373,7 @@ enum vmcs_field {
#define GUEST_INTR_STATE_MOV_SS 0x00000002
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010
/* GUEST_ACTIVITY_STATE flags */
#define GUEST_ACTIVITY_ACTIVE 0
@@ -542,16 +543,14 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_ACC_READ_BIT 0
#define EPT_VIOLATION_ACC_WRITE_BIT 1
#define EPT_VIOLATION_ACC_INSTR_BIT 2
-#define EPT_VIOLATION_READABLE_BIT 3
-#define EPT_VIOLATION_WRITABLE_BIT 4
-#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_RWX_SHIFT 3
+#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
-#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
-#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
-#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
+#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
/*
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index 06006b0351f3..8338b0432b50 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -77,30 +77,58 @@ static inline unsigned long find_zero(unsigned long mask)
* and the next page not being mapped, take the exception and
* return zeroes in the non-existing part.
*/
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
- unsigned long ret, dummy;
+ unsigned long offset, data;
+ unsigned long ret;
+
+ asm_volatile_goto(
+ "1: mov %[mem], %[ret]\n"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [ret] "=r" (ret)
+ : [mem] "m" (*(unsigned long *)addr)
+ : : do_exception);
+
+ return ret;
+
+do_exception:
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+
+ return ret;
+}
- asm(
- "1:\tmov %2,%0\n"
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long offset, data;
+ unsigned long ret, err = 0;
+
+ asm( "1: mov %[mem], %[ret]\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3:\t"
- "lea %2,%1\n\t"
- "and %3,%1\n\t"
- "mov (%1),%0\n\t"
- "leal %2,%%ecx\n\t"
- "andl %4,%%ecx\n\t"
- "shll $3,%%ecx\n\t"
- "shr %%cl,%0\n\t"
- "jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- :"=&r" (ret),"=&c" (dummy)
- :"m" (*(unsigned long *)addr),
- "i" (-sizeof(unsigned long)),
- "i" (sizeof(unsigned long)-1));
+
+ _ASM_EXTABLE_FAULT(1b, 2b)
+
+ : [ret] "=&r" (ret), "+a" (err)
+ : [mem] "m" (*(unsigned long *)addr));
+
+ if (unlikely(err)) {
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+ }
+
return ret;
}
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5c69f7eb5d47..e9170457697e 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -142,6 +142,21 @@ struct x86_init_acpi {
};
/**
+ * struct x86_guest - Functions used by misc guest incarnations like SEV, TDX, etc.
+ *
+ * @enc_status_change_prepare Notify HV before the encryption status of a range is changed
+ * @enc_status_change_finish Notify HV after the encryption status of a range is changed
+ * @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
+ * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ */
+struct x86_guest {
+ void (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
+ bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
+ bool (*enc_tlb_flush_required)(bool enc);
+ bool (*enc_cache_flush_required)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -287,12 +302,7 @@ struct x86_platform_ops {
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
struct x86_hyper_runtime hyper;
-};
-
-struct pci_dev;
-
-struct x86_msi_ops {
- void (*restore_msi_irqs)(struct pci_dev *dev);
+ struct x86_guest guest;
};
struct x86_apic_ops {
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index a9630104f1c4..78e667a31d6c 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -100,6 +100,13 @@
/* Memory mapped from other domains has valid IOMMU entries */
#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
+#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */
+/*
+ * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be
+ * used to store high bits for the Destination ID. This expands the Destination
+ * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768.
+ */
+#define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5)
/*
* Leaf 6 (0x40000x05)
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 454b20815f35..e5e0fe10c692 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -248,6 +248,7 @@ privcmd_call(unsigned int call,
return res;
}
+#ifdef CONFIG_XEN_PV
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
@@ -280,6 +281,107 @@ HYPERVISOR_callback_op(int cmd, void *arg)
return _hypercall2(int, callback_op, cmd, arg);
}
+static __always_inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static __always_inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+{
+ return _hypercall2(int, update_descriptor, ma, desc);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
+{
+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
+}
+
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+
+ trace_xen_mc_entry(mcl, 1);
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+
+ trace_xen_mc_entry(mcl, 3);
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+#endif
+
static inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
@@ -308,26 +410,6 @@ HYPERVISOR_platform_op(struct xen_platform_op *op)
return _hypercall1(int, platform_op, op);
}
-static inline int
-HYPERVISOR_set_debugreg(int reg, unsigned long value)
-{
- return _hypercall2(int, set_debugreg, reg, value);
-}
-
-static inline unsigned long
-HYPERVISOR_get_debugreg(int reg)
-{
- return _hypercall1(unsigned long, get_debugreg, reg);
-}
-
-static inline int
-HYPERVISOR_update_descriptor(u64 ma, u64 desc)
-{
- if (sizeof(u64) == sizeof(long))
- return _hypercall2(int, update_descriptor, ma, desc);
- return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
-}
-
static inline long
HYPERVISOR_memory_op(unsigned int cmd, void *arg)
{
@@ -341,24 +423,12 @@ HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
}
static inline int
-HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
- unsigned long flags)
-{
- if (sizeof(new_val) == sizeof(long))
- return _hypercall3(int, update_va_mapping, va,
- new_val.pte, flags);
- else
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte, new_val.pte >> 32, flags);
-}
-
-static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
return _hypercall2(int, event_channel_op, cmd, arg);
}
-static inline int
+static __always_inline int
HYPERVISOR_xen_version(int cmd, void *arg)
{
return _hypercall2(int, xen_version, cmd, arg);
@@ -394,14 +464,6 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
-#ifdef CONFIG_X86_64
-static inline int
-HYPERVISOR_set_segment_base(int reg, unsigned long value)
-{
- return _hypercall2(int, set_segment_base, reg, value);
-}
-#endif
-
static inline int
HYPERVISOR_suspend(unsigned long start_info_mfn)
{
@@ -423,13 +485,6 @@ HYPERVISOR_hvm_op(int op, void *arg)
}
static inline int
-HYPERVISOR_tmem_op(
- struct tmem_op *op)
-{
- return _hypercall1(int, tmem_op, op);
-}
-
-static inline int
HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
{
return _hypercall2(int, xenpmu_op, op, arg);
@@ -446,88 +501,4 @@ HYPERVISOR_dm_op(
return ret;
}
-static inline void
-MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
-{
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl->args[0] = set;
-
- trace_xen_mc_entry(mcl, 1);
-}
-
-static inline void
-MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
- pte_t new_val, unsigned long flags)
-{
- mcl->op = __HYPERVISOR_update_va_mapping;
- mcl->args[0] = va;
- if (sizeof(new_val) == sizeof(long)) {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
- } else {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
- }
-
- trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
-}
-
-static inline void
-MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
- struct desc_struct desc)
-{
- mcl->op = __HYPERVISOR_update_descriptor;
- if (sizeof(maddr) == sizeof(long)) {
- mcl->args[0] = maddr;
- mcl->args[1] = *(unsigned long *)&desc;
- } else {
- u32 *p = (u32 *)&desc;
-
- mcl->args[0] = maddr;
- mcl->args[1] = maddr >> 32;
- mcl->args[2] = *p++;
- mcl->args[3] = *p;
- }
-
- trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
-}
-
-static inline void
-MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
- int count, int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (unsigned long)req;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-
- trace_xen_mc_entry(mcl, 4);
-}
-
-static inline void
-MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
- int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmuext_op;
- mcl->args[0] = (unsigned long)op;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-
- trace_xen_mc_entry(mcl, 4);
-}
-
-static inline void
-MULTI_stack_switch(struct multicall_entry *mcl,
- unsigned long ss, unsigned long esp)
-{
- mcl->op = __HYPERVISOR_stack_switch;
- mcl->args[0] = ss;
- mcl->args[1] = esp;
-
- trace_xen_mc_entry(mcl, 2);
-}
-
#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index ff4b52e37e60..16f548a661cf 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -43,18 +43,12 @@ static inline uint32_t xen_cpuid_base(void)
return hypervisor_cpuid_base("XenVMMXenVMM", 2);
}
-#ifdef CONFIG_XEN
-extern bool __init xen_hvm_need_lapic(void);
+struct pci_dev;
-static inline bool __init xen_x2apic_para_available(void)
-{
- return xen_hvm_need_lapic();
-}
+#ifdef CONFIG_XEN_PV_DOM0
+bool xen_initdom_restore_msi(struct pci_dev *dev);
#else
-static inline bool __init xen_x2apic_para_available(void)
-{
- return (xen_cpuid_base() != 0);
-}
+static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; }
#endif
#ifdef CONFIG_HOTPLUG_CPU
@@ -62,4 +56,9 @@ void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
+#ifdef CONFIG_PVH
+void __init xen_pvh_init(struct boot_params *boot_params);
+void __init mem_map_via_hcall(struct boot_params *boot_params_p);
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h
deleted file mode 100644
index 63cd41b2e17a..000000000000
--- a/arch/x86/include/asm/xen/page-coherent.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_XEN_PAGE_COHERENT_H
-#define _ASM_X86_XEN_PAGE_COHERENT_H
-
-#include <asm/page.h>
-#include <linux/dma-mapping.h>
-
-static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags,
- unsigned long attrs)
-{
- void *vstart = (void*)__get_free_pages(flags, get_order(size));
- *dma_handle = virt_to_phys(vstart);
- return vstart;
-}
-
-static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
- void *cpu_addr, dma_addr_t dma_handle,
- unsigned long attrs)
-{
- free_pages((unsigned long) cpu_addr, get_order(size));
-}
-
-#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a162e559753..fa9ec20783fa 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -96,11 +96,7 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
asm volatile("1: mov %[val], %[ptr]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [ptr] "=m" (*addr)
: [val] "r" (val));
@@ -110,16 +106,12 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
static inline int xen_safe_read_ulong(const unsigned long *addr,
unsigned long *val)
{
- int ret = 0;
unsigned long rval = ~0ul;
+ int ret = 0;
asm volatile("1: mov %[ptr], %[rval]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [rval] "+r" (rval)
: [ptr] "m" (*addr));
*val = rval;
@@ -355,9 +347,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
-#define xen_remap(cookie, size) ioremap((cookie), (size))
-#define xen_unmap(cookie) iounmap((cookie))
-
static inline bool xen_arch_need_swiotlb(struct device *dev,
phys_addr_t phys,
dma_addr_t dev_addr)
@@ -365,9 +354,4 @@ static inline bool xen_arch_need_swiotlb(struct device *dev,
return false;
}
-static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
-{
- return __get_free_pages(__GFP_NOWARN, order);
-}
-
#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 3506d8c598c1..9015b888edd6 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,29 +14,13 @@ static inline int pci_xen_hvm_init(void)
return -1;
}
#endif
-#if defined(CONFIG_XEN_DOM0)
+#ifdef CONFIG_XEN_PV_DOM0
int __init pci_xen_initial_domain(void);
-int xen_find_device_domain_owner(struct pci_dev *dev);
-int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
-int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
static inline int __init pci_xen_initial_domain(void)
{
return -1;
}
-static inline int xen_find_device_domain_owner(struct pci_dev *dev)
-{
- return -1;
-}
-static inline int xen_register_device_domain_owner(struct pci_dev *dev,
- uint16_t domain)
-{
- return -1;
-}
-static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
-{
- return -1;
-}
#endif
#if defined(CONFIG_PCI_MSI)
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
index 6b56d0d45d15..77a2d19cc990 100644
--- a/arch/x86/include/asm/xen/swiotlb-xen.h
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -3,15 +3,15 @@
#define _ASM_X86_SWIOTLB_XEN_H
#ifdef CONFIG_SWIOTLB_XEN
-extern int xen_swiotlb;
-extern int __init pci_xen_swiotlb_detect(void);
-extern void __init pci_xen_swiotlb_init(void);
extern int pci_xen_swiotlb_init_late(void);
#else
-#define xen_swiotlb (0)
-static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
-static inline void __init pci_xen_swiotlb_init(void) { }
static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
#endif
+int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle);
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
+
#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 2ee95a7769e6..7b0307acc410 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -57,7 +57,8 @@
op(i + 3, 3)
static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
@@ -108,7 +109,8 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
@@ -142,8 +144,9 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
@@ -201,8 +204,9 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
@@ -238,8 +242,10 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
@@ -304,8 +310,10 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
@@ -343,8 +351,11 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
@@ -416,8 +427,11 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 67ceb790e639..7a6b9474591e 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -21,7 +21,8 @@
#include <asm/fpu/api.h>
static void
-xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 7;
@@ -64,8 +65,9 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 7;
@@ -113,8 +115,10 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 7;
@@ -168,8 +172,11 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
static void
-xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 7;
@@ -248,7 +255,8 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
#undef BLOCK
static void
-xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 6;
@@ -295,8 +303,9 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 6;
@@ -352,8 +361,10 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 6;
@@ -418,8 +429,11 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 6;
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 0c4e5b5e3852..7f81dd5897f4 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -26,7 +26,8 @@
BLOCK4(8) \
BLOCK4(12)
-static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1)
{
unsigned long lines = bytes >> 9;
@@ -52,8 +53,9 @@ do { \
kernel_fpu_end();
}
-static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
- unsigned long *p2)
+static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 9;
@@ -82,8 +84,10 @@ do { \
kernel_fpu_end();
}
-static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
- unsigned long *p2, unsigned long *p3)
+static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 9;
@@ -115,8 +119,11 @@ do { \
kernel_fpu_end();
}
-static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
- unsigned long *p2, unsigned long *p3, unsigned long *p4)
+static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 9;
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
new file mode 100644
index 000000000000..769b939444ae
--- /dev/null
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_ASM_X86_AMD_HSMP_H_
+#define _UAPI_ASM_X86_AMD_HSMP_H_
+
+#include <linux/types.h>
+
+#pragma pack(4)
+
+#define HSMP_MAX_MSG_LEN 8
+
+/*
+ * HSMP Messages supported
+ */
+enum hsmp_message_ids {
+ HSMP_TEST = 1, /* 01h Increments input value by 1 */
+ HSMP_GET_SMU_VER, /* 02h SMU FW version */
+ HSMP_GET_PROTO_VER, /* 03h HSMP interface version */
+ HSMP_GET_SOCKET_POWER, /* 04h average package power consumption */
+ HSMP_SET_SOCKET_POWER_LIMIT, /* 05h Set the socket power limit */
+ HSMP_GET_SOCKET_POWER_LIMIT, /* 06h Get current socket power limit */
+ HSMP_GET_SOCKET_POWER_LIMIT_MAX,/* 07h Get maximum socket power value */
+ HSMP_SET_BOOST_LIMIT, /* 08h Set a core maximum frequency limit */
+ HSMP_SET_BOOST_LIMIT_SOCKET, /* 09h Set socket maximum frequency level */
+ HSMP_GET_BOOST_LIMIT, /* 0Ah Get current frequency limit */
+ HSMP_GET_PROC_HOT, /* 0Bh Get PROCHOT status */
+ HSMP_SET_XGMI_LINK_WIDTH, /* 0Ch Set max and min width of xGMI Link */
+ HSMP_SET_DF_PSTATE, /* 0Dh Alter APEnable/Disable messages behavior */
+ HSMP_SET_AUTO_DF_PSTATE, /* 0Eh Enable DF P-State Performance Boost algorithm */
+ HSMP_GET_FCLK_MCLK, /* 0Fh Get FCLK and MEMCLK for current socket */
+ HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */
+ HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */
+ HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */
+ HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */
+ HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */
+ HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */
+ HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */
+ HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */
+ HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */
+ HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */
+ HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */
+ HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */
+ HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */
+ HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */
+ HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */
+ HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */
+ HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */
+ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */
+ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */
+ HSMP_MSG_ID_MAX,
+};
+
+struct hsmp_message {
+ __u32 msg_id; /* Message ID */
+ __u16 num_args; /* Number of input argument words in message */
+ __u16 response_sz; /* Number of expected output/response words */
+ __u32 args[HSMP_MAX_MSG_LEN]; /* argument/response buffer */
+ __u16 sock_ind; /* socket number */
+};
+
+enum hsmp_msg_type {
+ HSMP_RSVD = -1,
+ HSMP_SET = 0,
+ HSMP_GET = 1,
+};
+
+struct hsmp_msg_desc {
+ int num_args;
+ int response_sz;
+ enum hsmp_msg_type type;
+};
+
+/*
+ * User may use these comments as reference, please find the
+ * supported list of messages and message definition in the
+ * HSMP chapter of respective family/model PPR.
+ *
+ * Not supported messages would return -ENOMSG.
+ */
+static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
+ /* RESERVED */
+ {0, 0, HSMP_RSVD},
+
+ /*
+ * HSMP_TEST, num_args = 1, response_sz = 1
+ * input: args[0] = xx
+ * output: args[0] = xx + 1
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SMU_VER, num_args = 0, response_sz = 1
+ * output: args[0] = smu fw ver
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_PROTO_VER, num_args = 0, response_sz = 1
+ * output: args[0] = proto version
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER, num_args = 0, response_sz = 1
+ * output: args[0] = socket power in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_SOCKET_POWER_LIMIT, num_args = 1, response_sz = 0
+ * input: args[0] = power limit value in mWatts
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = socket power limit value in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER_LIMIT_MAX, num_args = 0, response_sz = 1
+ * output: args[0] = maximuam socket power limit in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_BOOST_LIMIT, num_args = 1, response_sz = 0
+ * input: args[0] = apic id[31:16] + boost limit value in MHz[15:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_BOOST_LIMIT_SOCKET, num_args = 1, response_sz = 0
+ * input: args[0] = boost limit value in MHz
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_BOOST_LIMIT, num_args = 1, response_sz = 1
+ * input: args[0] = apic id
+ * output: args[0] = boost limit value in MHz
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_PROC_HOT, num_args = 0, response_sz = 1
+ * output: args[0] = proc hot status
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_XGMI_LINK_WIDTH, num_args = 1, response_sz = 0
+ * input: args[0] = min link width[15:8] + max link width[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_DF_PSTATE, num_args = 1, response_sz = 0
+ * input: args[0] = df pstate[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /* HSMP_SET_AUTO_DF_PSTATE, num_args = 0, response_sz = 0 */
+ {0, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_FCLK_MCLK, num_args = 0, response_sz = 2
+ * output: args[0] = fclk in MHz, args[1] = mclk in MHz
+ */
+ {0, 2, HSMP_GET},
+
+ /*
+ * HSMP_GET_CCLK_THROTTLE_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = core clock in MHz
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_C0_PERCENT, num_args = 0, response_sz = 1
+ * output: args[0] = average c0 residency
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 0
+ * input: args[0] = nbioid[23:16] + max dpm level[15:8] + min dpm level[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1
+ * input: args[0] = nbioid[23:16]
+ * output: args[0] = max dpm level[15:8] + min dpm level[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1
+ * output: args[0] = max bw in Gbps[31:20] + utilised bw in Gbps[19:8] +
+ * bw in percentage[7:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_TEMP_MONITOR, num_args = 0, response_sz = 1
+ * output: args[0] = temperature in degree celsius. [15:8] integer part +
+ * [7:5] fractional part
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = refresh rate[3] + temperature range[2:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = frequency in MHz[31:16] + frequency source[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1
+ * input: args[0] = apic id [31:0]
+ * output: args[0] = frequency in MHz[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1
+ * output: args[0] = power in mW[31:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1
+ * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = io bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = xgmi bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0
+ * input: args[0] = min link width[15:8] + max link width[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1
+ * input: args[0] = link rate control value
+ * output: args[0] = previous link rate control value
+ */
+ {1, 1, HSMP_SET},
+
+ /*
+ * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0
+ * input: args[0] = power efficiency mode[2:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0
+ * input: args[0] = min df pstate[15:8] + max df pstate[7:0]
+ */
+ {1, 0, HSMP_SET},
+};
+
+/* Reset to default packing */
+#pragma pack()
+
+/* Define unique ioctl command for hsmp msgs using generic _IOWR */
+#define HSMP_BASE_IOCTL_NR 0xF8
+#define HSMP_IOCTL_CMD _IOWR(HSMP_BASE_IOCTL_NR, 0, struct hsmp_message)
+
+#endif /*_ASM_X86_AMD_HSMP_H_*/
diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h
index 580e3c567046..6beb55bbefa4 100644
--- a/arch/x86/include/uapi/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -12,9 +12,9 @@
/* entries in ARCH_DLINFO: */
#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
+# define AT_VECTOR_SIZE_ARCH 3
#else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
+# define AT_VECTOR_SIZE_ARCH 2
#endif
#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 600a141c8805..bea5cdcdf532 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -10,6 +10,7 @@
#define SETUP_EFI 4
#define SETUP_APPLE_PROPERTIES 5
#define SETUP_JAILHOUSE 6
+#define SETUP_CC_BLOB 7
#define SETUP_INDIRECT (1<<31)
@@ -187,7 +188,8 @@ struct boot_params {
__u32 ext_ramdisk_image; /* 0x0c0 */
__u32 ext_ramdisk_size; /* 0x0c4 */
__u32 ext_cmd_line_ptr; /* 0x0c8 */
- __u8 _pad4[116]; /* 0x0cc */
+ __u8 _pad4[112]; /* 0x0cc */
+ __u32 cc_blob_address; /* 0x13c */
struct edid_info edid_info; /* 0x140 */
struct efi_info efi_info; /* 0x1c0 */
__u32 alt_mem_k; /* 0x1e0 */
@@ -234,7 +236,7 @@ struct boot_params {
* handling of page tables.
*
* These enums should only ever be used by x86 code, and the code that uses
- * it should be well contained and compartamentalized.
+ * it should be well contained and compartmentalized.
*
* KVM and Xen HVM do not have a subarch as these are expected to follow
* standard x86 boot entries. If there is a genuine need for "hypervisor" type
@@ -252,7 +254,7 @@ struct boot_params {
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
- * currently supportd through this PV boot path.
+ * currently supported through this PV boot path.
* @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
* systems which do not have the PCI legacy interfaces.
* @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
index d95d080b30e3..0007ba077c0c 100644
--- a/arch/x86/include/uapi/asm/debugreg.h
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -24,6 +24,7 @@
#define DR_TRAP3 (0x8) /* db3 */
#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
+#define DR_BUS_LOCK (0x800) /* bus_lock */
#define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
index 5fdfcb47000f..054604aba9f0 100644
--- a/arch/x86/include/uapi/asm/hwcap2.h
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -2,10 +2,12 @@
#ifndef _ASM_X86_HWCAP2_H
#define _ASM_X86_HWCAP2_H
+#include <linux/const.h>
+
/* MONITOR/MWAIT enabled in Ring 3 */
-#define HWCAP2_RING3MWAIT (1 << 0)
+#define HWCAP2_RING3MWAIT _BITUL(0)
/* Kernel allows FSGSBASE instructions available in Ring 3 */
-#define HWCAP2_FSGSBASE BIT(1)
+#define HWCAP2_FSGSBASE _BITUL(1)
#endif
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c8af82..21614807a2cb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -159,6 +159,19 @@ struct kvm_sregs {
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
+struct kvm_sregs2 {
+ /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 flags;
+ __u64 pdptrs[4];
+};
+#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1
+
/* for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
@@ -282,6 +295,7 @@ struct kvm_debug_exit_arch {
#define KVM_GUESTDBG_USE_HW_BP 0x00020000
#define KVM_GUESTDBG_INJECT_DB 0x00040000
#define KVM_GUESTDBG_INJECT_BP 0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ 0x00100000
/* for KVM_SET_GUEST_DEBUG */
struct kvm_guest_debug_arch {
@@ -359,9 +373,23 @@ struct kvm_debugregs {
__u64 reserved[9];
};
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
struct kvm_xsave {
+ /*
+ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * respectively, when invoked on the vm file descriptor.
+ *
+ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * will always be at least 4096. Currently, it is only greater
+ * than 4096 if a dynamic feature has been enabled with
+ * ``arch_prctl()``, but this may change in the future.
+ *
+ * The offsets of the state save areas in struct kvm_xsave follow
+ * the contents of CPUID leaf 0xD on the host.
+ */
__u32 region[1024];
+ __u32 extra[0];
};
#define KVM_MAX_XCRS 16
@@ -400,11 +428,12 @@ struct kvm_sync_regs {
struct kvm_vcpu_events events;
};
-#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
-#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -424,6 +453,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP 0
+
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -437,6 +469,8 @@ struct kvm_vmx_nested_state_hdr {
__u16 flags;
} smm;
+ __u16 pad;
+
__u32 flags;
__u64 preemption_timer_deadline;
};
@@ -488,4 +522,8 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 950afebfba88..6e64b27b2c1e 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -8,6 +8,7 @@
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
* a particular paravirtualization, the appropriate feature bit should
@@ -33,6 +34,8 @@
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
+#define KVM_FEATURE_HC_MAP_GPA_RANGE 16
+#define KVM_FEATURE_MIGRATION_CONTROL 17
#define KVM_HINTS_REALTIME 0
@@ -54,6 +57,7 @@
#define MSR_KVM_POLL_CONTROL 0x4b564d05
#define MSR_KVM_ASYNC_PF_INT 0x4b564d06
#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
+#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08
struct kvm_steal_time {
__u64 steal;
@@ -90,6 +94,16 @@ struct kvm_clock_pairing {
/* MSR_KVM_ASYNC_PF_INT */
#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0)
+/* MSR_KVM_MIGRATION_CONTROL */
+#define KVM_MIGRATION_READY (1 << 0)
+
+/* KVM_HC_MAP_GPA_RANGE */
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0)
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1)
+#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4)
+#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1)
+#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index d4a8d0424bfb..775dbd3aff73 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -5,20 +5,6 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-/*
- * Take the 4 protection key bits out of the vma->vm_flags
- * value and turn them in to the bits that we can put in
- * to a pte.
- *
- * Only override these if Protection Keys are available
- * (which is only on 64-bit).
- */
-#define arch_vm_get_page_prot(vm_flags) __pgprot( \
- ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
- ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
- ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
- ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
-
#define arch_calc_vm_prot_bits(prot, key) ( \
((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
index b3d0664fadc9..ac83e25bbf37 100644
--- a/arch/x86/include/uapi/asm/msgbuf.h
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -12,7 +12,7 @@
* The msqid64_ds structure for x86 architecture with x32 ABI.
*
* On x86-32 and x86-64 we can just use the generic definition, but
- * x32 uses the same binary layout as x86_64, which is differnet
+ * x32 uses the same binary layout as x86_64, which is different
* from other 32-bit architectures.
*/
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..500b96e71f18 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -2,16 +2,22 @@
#ifndef _ASM_X86_PRCTL_H
#define _ASM_X86_PRCTL_H
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
-#define ARCH_GET_CPUID 0x1011
-#define ARCH_SET_CPUID 0x1012
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
-#define ARCH_MAP_VDSO_X32 0x2001
-#define ARCH_MAP_VDSO_32 0x2002
-#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_GET_XCOMP_SUPP 0x1021
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..c47cc7f2feeb 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */
+#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 9034f3007c4e..f4b81587e90b 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -27,6 +27,8 @@ enum sgx_page_flags {
_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
#define SGX_IOC_ENCLAVE_PROVISION \
_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+ _IO(SGX_MAGIC, 0x04)
/**
* struct sgx_enclave_create - parameter structure for the
@@ -152,7 +154,7 @@ struct sgx_enclave_run {
* Most exceptions reported on ENCLU, including those that occur within the
* enclave, are fixed up and reported synchronously instead of being delivered
* via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are
- * never fixed up and are always delivered via standard signals. On synchrously
+ * never fixed up and are always delivered via standard signals. On synchronously
* reported exceptions, -EFAULT is returned and details about the exception are
* recorded in @run.exception, the optional sgx_enclave_exception struct.
*
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
index f0305dc660c9..13775bfdfee2 100644
--- a/arch/x86/include/uapi/asm/shmbuf.h
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -5,17 +5,21 @@
#if !defined(__x86_64__) || !defined(__ILP32__)
#include <asm-generic/shmbuf.h>
#else
+
+#include <asm/ipcbuf.h>
+#include <asm/posix_types.h>
+
/*
* The shmid64_ds structure for x86 architecture with x32 ABI.
*
* On x86-32 and x86-64 we can just use the generic definition, but
- * x32 uses the same binary layout as x86_64, which is differnet
+ * x32 uses the same binary layout as x86_64, which is different
* from other 32-bit architectures.
*/
struct shmid64_ds {
struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_size_t shm_segsz; /* size of segment (bytes) */
__kernel_long_t shm_atime; /* last attach time */
__kernel_long_t shm_dtime; /* last detach time */
__kernel_long_t shm_ctime; /* last change time */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..d0d9b331d3a1 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -139,7 +139,7 @@ struct _fpstate_32 {
* The 64-bit FPU frame. (FXSAVE format and later)
*
* Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is
- * larger: 'struct _xstate'. Note that 'struct _xstate' embedds
+ * larger: 'struct _xstate'. Note that 'struct _xstate' embeds
* 'struct _fpstate' so that you can always assume the _fpstate portion
* exists so that you can check the magic value.
*
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index 164a22a72984..777c3a0f4e23 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -104,7 +104,7 @@ struct sigaction {
typedef struct sigaltstack {
void __user *ss_sp;
int ss_flags;
- size_t ss_size;
+ __kernel_size_t ss_size;
} stack_t;
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 554f75fe013c..f69c168391aa 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -108,8 +108,19 @@
#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
+#define SVM_VMGEXIT_PSC 0x80000010
+#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012
+#define SVM_VMGEXIT_AP_CREATION 0x80000013
+#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
+#define SVM_VMGEXIT_AP_CREATE 1
+#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
+/* Exit code reserved for hypervisor/software use */
+#define SVM_EXIT_SW 0xf0000000
+
#define SVM_EXIT_ERR -1
#define SVM_EXIT_REASONS \
@@ -215,6 +226,11 @@
{ SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
{ SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
{ SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
+ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \
+ { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \
+ { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \
+ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \
+ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \
{ SVM_EXIT_ERR, "invalid_guest_state" }
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8e650a985e3..946d761adbd3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..4c8b6ae802ac 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -20,7 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
-CFLAGS_REMOVE_sev-es.o = -pg
+CFLAGS_REMOVE_sev.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -28,18 +28,13 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
-KASAN_SANITIZE_sev-es.o := n
+KASAN_SANITIZE_sev.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
KCSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD_test_nx.o := y
-OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y
-
-ifdef CONFIG_FRAME_POINTER
-OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
-endif
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
@@ -47,8 +42,6 @@ endif
# non-deterministic coverage.
KCOV_INSTRUMENT := n
-CFLAGS_head$(BITS).o += -fno-stack-protector
-
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
@@ -69,7 +62,6 @@ obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
-obj-y += pci-iommu_table.o
obj-y += resource.o
obj-y += irqflags.o
obj-y += static_call.o
@@ -82,7 +74,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
@@ -103,6 +95,8 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
@@ -121,7 +115,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
@@ -133,12 +127,8 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
-obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
-obj-y += sysfb.o
-obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
-obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
@@ -149,7 +139,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index cf340d85946a..fc17b3f136fe 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -3,7 +3,7 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
-obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7bdc0239a943..907cc98b1938 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -5,6 +5,7 @@
* Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
*/
+#define pr_fmt(fmt) "ACPI: " fmt
#include <linux/init.h>
#include <linux/acpi.h>
@@ -42,8 +43,6 @@ EXPORT_SYMBOL(acpi_disabled);
# include <asm/proto.h>
#endif /* X86 */
-#define PREFIX "ACPI: "
-
int acpi_noirq; /* skip ACPI IRQ initialization */
static int acpi_nobgrt; /* skip ACPI BGRT */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
@@ -63,6 +62,14 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool acpi_support_online_capable;
+#endif
+
+#ifdef CONFIG_X86_64
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr;
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
#endif
#ifdef CONFIG_X86_IO_APIC
@@ -130,16 +137,17 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
madt = (struct acpi_table_madt *)table;
if (!madt) {
- printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ pr_warn("Unable to map MADT\n");
return -ENODEV;
}
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
- printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
- madt->address);
+ pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->header.revision >= 5)
+ acpi_support_online_capable = true;
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
@@ -161,7 +169,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
int cpu;
if (id >= MAX_LOCAL_APIC) {
- printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ pr_info("skipped apicid that is too big\n");
return -EINVAL;
}
@@ -213,13 +221,13 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
*/
if (!apic->apic_id_valid(apic_id)) {
if (enabled)
- pr_warn(PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
return 0;
}
acpi_register_lapic(apic_id, processor->uid, enabled);
#else
- printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
#endif
return 0;
@@ -241,6 +249,12 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
if (processor->id == 0xff)
return 0;
+ /* don't register processors that can not be onlined */
+ if (acpi_support_online_capable &&
+ !(processor->lapic_flags & ACPI_MADT_ENABLED) &&
+ !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -306,7 +320,7 @@ acpi_parse_x2apic_nmi(union acpi_subtable_headers *header,
acpi_table_print_madt_entry(&header->common);
if (x2apic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
@@ -324,12 +338,65 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
acpi_table_print_madt_entry(&header->common);
if (lapic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
-#endif /*CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_X86_64
+static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+{
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * ACPI specification doesn't provide any guidance on how long kernel
+ * has to wait for a wake up acknowledgement. It also doesn't provide
+ * a way to cancel a wake up request if it takes too long.
+ *
+ * In TDX environment, the VMM has control over how long it takes to
+ * wake up secondary. It can postpone scheduling secondary vCPU
+ * indefinitely. Giving up on wake up request and reporting error opens
+ * possible attack vector for VMM: it can wake up a secondary CPU when
+ * kernel doesn't expect it. Wait until positive result of the wake up
+ * request.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
@@ -368,7 +435,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
isa_irq_to_gsi[bus_irq] = gsi;
}
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
int polarity)
{
#ifdef CONFIG_X86_MPPARSE
@@ -380,9 +447,9 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
u8 pin;
if (!acpi_ioapic)
- return 0;
+ return;
if (!dev || !dev_is_pci(dev))
- return 0;
+ return;
pdev = to_pci_dev(dev);
number = pdev->bus->number;
@@ -401,7 +468,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
mp_save_irq(&mp_irq);
#endif
- return 0;
}
static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
@@ -514,14 +580,14 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 override ignored.\n");
+ pr_warn("BIOS IRQ0 override ignored.\n");
return 0;
}
if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
&& (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
- printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
}
@@ -560,10 +626,10 @@ acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end
* If a PIC-mode SCI is not recognized or gives spurious IRQ7's
* it may require Edge Trigger -- use "acpi_sci=edge"
*
- * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
* for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
- * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
- * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
*/
void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
@@ -572,7 +638,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
unsigned int old, new;
/* Real old ELCR mask */
- old = inb(0x4d0) | (inb(0x4d1) << 8);
+ old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8);
/*
* If we use ACPI to set PCI IRQs, then we should clear ELCR
@@ -597,9 +663,9 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
if (old == new)
return;
- printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
- outb(new, 0x4d0);
- outb(new >> 8, 0x4d1);
+ pr_warn("setting ELCR to %04x (from %04x)\n", new, old);
+ outb(new, PIC_ELCR1);
+ outb(new >> 8, PIC_ELCR2);
}
int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
@@ -754,7 +820,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
if (cpu < 0) {
- pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+ pr_info("Unable to map lapic to logical cpu number\n");
return cpu;
}
@@ -830,7 +896,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
EXPORT_SYMBOL(acpi_unregister_ioapic);
/**
- * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base
+ * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base
* has been registered
* @handle: ACPI handle of the IOAPIC device
* @gsi_base: GSI base associated with the IOAPIC
@@ -870,8 +936,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
- printk(KERN_WARNING PREFIX "HPET timers must be located in "
- "memory.\n");
+ pr_warn("HPET timers must be located in memory.\n");
return -1;
}
@@ -883,9 +948,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
* want to allocate a resource there.
*/
if (!hpet_address) {
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: %#lx is invalid\n",
- hpet_tbl->id, hpet_address);
+ pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address);
return 0;
}
#ifdef CONFIG_X86_64
@@ -896,21 +959,17 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
*/
if (hpet_address == 0xfed0000000000000UL) {
if (!hpet_force_user) {
- printk(KERN_WARNING PREFIX "HPET id: %#x "
- "base: 0xfed0000000000000 is bogus\n "
- "try hpet=force on the kernel command line to "
- "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address = 0;
return 0;
}
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: 0xfed0000000000000 fixed up "
- "to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address >>= 32;
}
#endif
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, hpet_address);
+ pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address);
/*
* Allocate and initialize the HPET firmware resource for adding into
@@ -955,24 +1014,24 @@ late_initcall(hpet_insert_resource);
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
- pr_debug("ACPI: no legacy devices present\n");
+ pr_debug("no legacy devices present\n");
x86_platform.legacy.devices.pnpbios = 0;
}
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
!(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
- pr_debug("ACPI: i8042 controller is absent\n");
+ pr_debug("i8042 controller is absent\n");
x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
}
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
- pr_debug("ACPI: not registering RTC platform device\n");
+ pr_debug("not registering RTC platform device\n");
x86_platform.legacy.rtc = 0;
}
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
- pr_debug("ACPI: probing for VGA not safe\n");
+ pr_debug("probing for VGA not safe\n");
x86_platform.legacy.no_vga = 1;
}
@@ -997,8 +1056,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
}
if (pmtmr_ioport)
- printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
- pmtmr_ioport);
+ pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport);
#endif
return 0;
}
@@ -1024,8 +1082,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
+ pr_err("Error parsing LAPIC address override entry\n");
return count;
}
@@ -1057,8 +1114,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
sizeof(struct acpi_table_madt),
madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
if (ret < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC/X2APIC entries\n");
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
return ret;
}
@@ -1066,11 +1122,11 @@ static int __init acpi_parse_madt_lapic_entries(void)
x2count = madt_proc[1].count;
}
if (!count && !x2count) {
- printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ pr_err("No LAPIC entries present\n");
/* TBD: Cleanup to allow fallback to MPS */
return -ENODEV;
} else if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ pr_err("Error parsing LAPIC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1080,12 +1136,35 @@ static int __init acpi_parse_madt_lapic_entries(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ pr_err("Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
return 0;
}
+
+#ifdef CONFIG_X86_64
+static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ return -ENODEV;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+ if (BAD_MADT_ENTRY(mp_wake, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
+
+ acpi_wake_cpu_handler_update(acpi_wakeup_cpu);
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1139,7 +1218,7 @@ static void __init mp_config_acpi_legacy_irqs(void)
}
if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ pr_debug("ACPI: IRQ%d used by override.\n", i);
continue; /* IRQ already used */
}
@@ -1179,26 +1258,24 @@ static int __init acpi_parse_madt_ioapic_entries(void)
* if "noapic" boot option, don't look for IO-APICs
*/
if (skip_ioapic_setup) {
- printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
- "due to 'noapic' option.\n");
+ pr_info("Skipping IOAPIC probe due to 'noapic' option.\n");
return -ENODEV;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
MAX_IO_APICS);
if (!count) {
- printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ pr_err("No IOAPIC entries present\n");
return -ENODEV;
} else if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ pr_err("Error parsing IOAPIC entry\n");
return count;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
acpi_parse_int_src_ovr, nr_irqs);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing interrupt source overrides entry\n");
+ pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1218,7 +1295,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
acpi_parse_nmi_src, nr_irqs);
if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1251,8 +1328,7 @@ static void __init early_acpi_process_madt(void)
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
}
@@ -1284,13 +1360,20 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
+
+#ifdef CONFIG_X86_64
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_parse_mp_wake, 1);
+#endif
}
if (error == -EINVAL) {
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
} else {
@@ -1300,8 +1383,7 @@ static void __init acpi_process_madt(void)
* Boot with "acpi=off" to use MPS on such a system.
*/
if (smp_found_config) {
- printk(KERN_WARNING PREFIX
- "No APIC-table, disabling MPS\n");
+ pr_warn("No APIC-table, disabling MPS\n");
smp_found_config = 0;
}
}
@@ -1311,11 +1393,9 @@ static void __init acpi_process_madt(void)
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic)
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ pr_info("Using ACPI (MADT) for SMP configuration information\n");
else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+ pr_info("Using ACPI for processor (LAPIC) configuration information\n");
#endif
return;
}
@@ -1323,8 +1403,7 @@ static void __init acpi_process_madt(void)
static int __init disable_acpi_irq(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
- d->ident);
+ pr_notice("%s detected: force use of acpi=noirq\n", d->ident);
acpi_noirq_set();
}
return 0;
@@ -1333,21 +1412,30 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d)
static int __init disable_acpi_pci(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
- d->ident);
+ pr_notice("%s detected: force use of pci=noacpi\n", d->ident);
acpi_disable_pci();
}
return 0;
}
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ } else {
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
+ }
+ return 0;
+}
+
static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+ pr_notice("%s detected: acpi off\n", d->ident);
disable_acpi();
} else {
- printk(KERN_NOTICE
- "Warning: DMI blacklist says broken, but acpi forced\n");
+ pr_notice("Warning: DMI blacklist says broken, but acpi forced\n");
}
return 0;
}
@@ -1464,6 +1552,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
{}
};
@@ -1554,10 +1655,18 @@ void __init acpi_boot_table_init(void)
/*
* Initialize the ACPI boot-time table parser.
*/
- if (acpi_table_init()) {
+ if (acpi_locate_initial_tables())
disable_acpi();
- return;
- }
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1566,22 +1675,13 @@ void __init acpi_boot_table_init(void)
*/
if (acpi_blacklisted()) {
if (acpi_force) {
- printk(KERN_WARNING PREFIX "acpi=force override\n");
+ pr_warn("acpi=force override\n");
} else {
- printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ pr_warn("Disabling ACPI support\n");
disable_acpi();
- return;
+ return 1;
}
}
-}
-
-int __init early_acpi_boot_init(void)
-{
- /*
- * If acpi_disabled, bail out
- */
- if (acpi_disabled)
- return 1;
/*
* Process the Multiple APIC Description Table (MADT), if present
@@ -1657,7 +1757,7 @@ static int __init parse_acpi(char *arg)
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
}
- /* "acpi=copy_dsdt" copys DSDT */
+ /* "acpi=copy_dsdt" copies DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
}
@@ -1693,9 +1793,7 @@ int __init acpi_mps_check(void)
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
if (acpi_disabled || acpi_noirq) {
- printk(KERN_WARNING "MPS support code is not built-in.\n"
- "Using acpi=off or acpi=noirq or pci=noacpi "
- "may have problem\n");
+ pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
}
#endif
@@ -1764,7 +1862,7 @@ int __acpi_release_global_lock(unsigned int *lock)
void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
{
- e820__range_add(addr, size, E820_TYPE_ACPI);
+ e820__range_add(addr, size, E820_TYPE_NVS);
e820__update_table_print();
}
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
new file mode 100644
index 000000000000..8b8cbf22461a
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * cppc.c: CPPC Interface for x86
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/topology.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
+
+static void amd_set_max_freq_ratio(void)
+{
+ struct cppc_perf_caps perf_caps;
+ u64 highest_perf, nominal_perf;
+ u64 perf_ratio;
+ int rc;
+
+ rc = cppc_get_perf_caps(0, &perf_caps);
+ if (rc) {
+ pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ return;
+ }
+
+ highest_perf = amd_get_highest_perf();
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
+ pr_debug("Could not retrieve highest or nominal performance\n");
+ return;
+ }
+
+ perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+ /* midpoint between max_boost and max_P */
+ perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
+ if (!perf_ratio) {
+ pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
+ return;
+ }
+
+ freq_invariance_set_perf_ratio(perf_ratio, false);
+}
+
+static DEFINE_MUTEX(freq_invariance_lock);
+
+void init_freq_invariance_cppc(void)
+{
+ static bool init_done;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ mutex_lock(&freq_invariance_lock);
+ if (!init_done)
+ amd_set_max_freq_ratio();
+ init_done = true;
+ mutex_unlock(&freq_invariance_lock);
+}
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
deleted file mode 100644
index b961de569e7e..000000000000
--- a/arch/x86/kernel/acpi/cppc_msr.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * cppc_msr.c: MSR Interface for CPPC
- * Copyright (c) 2016, Intel Corporation.
- */
-
-#include <acpi/cppc_acpi.h>
-#include <asm/msr.h>
-
-/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
-
-bool cpc_ffh_supported(void)
-{
- return true;
-}
-
-int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
-{
- int err;
-
- err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
- if (!err) {
- u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
- reg->bit_offset);
-
- *val &= mask;
- *val >>= reg->bit_offset;
- }
- return err;
-}
-
-int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
-{
- u64 rd_val;
- int err;
-
- err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
- if (!err) {
- u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
- reg->bit_offset);
-
- val <<= reg->bit_offset;
- val &= mask;
- rd_val &= ~mask;
- rd_val |= val;
- err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
- }
- return err;
-}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 49ae4e1ac9cd..7945eae5b315 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
*/
flags->bm_control = 0;
}
+ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) {
+ /*
+ * For all AMD Zen or newer CPUs that support C3, caches
+ * should not be flushed by software while entering C3
+ * type state. Set bm->check to 1 so that kernel doesn't
+ * need to execute cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In current AMD C state implementation ARB_DIS is no longer
+ * used. So set bm_control to zero to indicate ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
@@ -197,7 +212,8 @@ static int __init ffh_cstate_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL &&
- c->x86_vendor != X86_VENDOR_AMD)
+ c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index cc1fea76aab0..3b7f4cdbf2e0 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -15,6 +15,7 @@
#include <asm/desc.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>
+#include <asm/hypervisor.h>
#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
@@ -41,7 +42,7 @@ unsigned long acpi_get_wakeup_address(void)
* x86_acpi_enter_sleep_state - enter sleep state
* @state: Sleep state to enter.
*
- * Wrapper around acpi_enter_sleep_state() to be called by assmebly.
+ * Wrapper around acpi_enter_sleep_state() to be called by assembly.
*/
asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
{
@@ -139,8 +140,10 @@ static int __init acpi_sleep_setup(char *str)
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_hwsig", 8) == 0)
+ acpi_check_s4_hw_signature = 1;
if (strncmp(str, "s4_nohwsig", 10) == 0)
- acpi_no_s4_hw_signature();
+ acpi_check_s4_hw_signature = 0;
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
@@ -158,3 +161,21 @@ static int __init acpi_sleep_setup(char *str)
}
__setup("acpi_sleep=", acpi_sleep_setup);
+
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST)
+static int __init init_s4_sigcheck(void)
+{
+ /*
+ * If running on a hypervisor, honour the ACPI specification
+ * by default and trigger a clean reboot when the hardware
+ * signature in FACS is changed after hibernation.
+ */
+ if (acpi_check_s4_hw_signature == -1 &&
+ !hypervisor_is_type(X86_HYPER_NATIVE))
+ acpi_check_s4_hw_signature = 1;
+
+ return 0;
+}
+/* This must happen before acpi_init() which is a subsys initcall */
+arch_initcall(init_s4_sigcheck);
+#endif
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index daf88f8143c5..cf69081073b5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -60,7 +60,7 @@ save_registers:
popl saved_context_eflags
movl $ret_point, saved_eip
- ret
+ RET
restore_registers:
@@ -70,7 +70,7 @@ restore_registers:
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
@@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel)
ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_suspend_lowlevel)
.data
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 56b6865afb2a..d5d8a352eafa 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8d778e46725d..e257f6c80372 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -28,6 +28,8 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
+#include <asm/paravirt.h>
+#include <asm/asm-prototypes.h>
int __read_mostly alternatives_patched;
@@ -74,186 +76,30 @@ do { \
} \
} while (0)
-/*
- * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
- * that correspond to that nop. Getting from one nop to the next, we
- * add to the array the offset that is equal to the sum of all sizes of
- * nops preceding the one we are after.
- *
- * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
- * nice symmetry of sizes of the previous nops.
- */
-#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char intelnops[] =
-{
- GENERIC_NOP1,
- GENERIC_NOP2,
- GENERIC_NOP3,
- GENERIC_NOP4,
- GENERIC_NOP5,
- GENERIC_NOP6,
- GENERIC_NOP7,
- GENERIC_NOP8,
- GENERIC_NOP5_ATOMIC
-};
-static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
-{
- NULL,
- intelnops,
- intelnops + 1,
- intelnops + 1 + 2,
- intelnops + 1 + 2 + 3,
- intelnops + 1 + 2 + 3 + 4,
- intelnops + 1 + 2 + 3 + 4 + 5,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
-#endif
-
-#ifdef K8_NOP1
-static const unsigned char k8nops[] =
-{
- K8_NOP1,
- K8_NOP2,
- K8_NOP3,
- K8_NOP4,
- K8_NOP5,
- K8_NOP6,
- K8_NOP7,
- K8_NOP8,
- K8_NOP5_ATOMIC
-};
-static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
-{
- NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
-#endif
-
-#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char k7nops[] =
-{
- K7_NOP1,
- K7_NOP2,
- K7_NOP3,
- K7_NOP4,
- K7_NOP5,
- K7_NOP6,
- K7_NOP7,
- K7_NOP8,
- K7_NOP5_ATOMIC
-};
-static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+static const unsigned char x86nops[] =
{
- NULL,
- k7nops,
- k7nops + 1,
- k7nops + 1 + 2,
- k7nops + 1 + 2 + 3,
- k7nops + 1 + 2 + 3 + 4,
- k7nops + 1 + 2 + 3 + 4 + 5,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ BYTES_NOP1,
+ BYTES_NOP2,
+ BYTES_NOP3,
+ BYTES_NOP4,
+ BYTES_NOP5,
+ BYTES_NOP6,
+ BYTES_NOP7,
+ BYTES_NOP8,
};
-#endif
-#ifdef P6_NOP1
-static const unsigned char p6nops[] =
-{
- P6_NOP1,
- P6_NOP2,
- P6_NOP3,
- P6_NOP4,
- P6_NOP5,
- P6_NOP6,
- P6_NOP7,
- P6_NOP8,
- P6_NOP5_ATOMIC
-};
-static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
NULL,
- p6nops,
- p6nops + 1,
- p6nops + 1 + 2,
- p6nops + 1 + 2 + 3,
- p6nops + 1 + 2 + 3 + 4,
- p6nops + 1 + 2 + 3 + 4 + 5,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops,
+ x86nops + 1,
+ x86nops + 1 + 2,
+ x86nops + 1 + 2 + 3,
+ x86nops + 1 + 2 + 3 + 4,
+ x86nops + 1 + 2 + 3 + 4 + 5,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
-#endif
-
-/* Initialize these to a safe default */
-#ifdef CONFIG_X86_64
-const unsigned char * const *ideal_nops = p6_nops;
-#else
-const unsigned char * const *ideal_nops = intel_nops;
-#endif
-
-void __init arch_init_ideal_nops(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- /*
- * Due to a decoder implementation quirk, some
- * specific Intel CPUs actually perform better with
- * the "k8_nops" than with the SDM-recommended NOPs.
- */
- if (boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model >= 0x0f &&
- boot_cpu_data.x86_model != 0x1c &&
- boot_cpu_data.x86_model != 0x26 &&
- boot_cpu_data.x86_model != 0x27 &&
- boot_cpu_data.x86_model < 0x30) {
- ideal_nops = k8_nops;
- } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
- ideal_nops = p6_nops;
- } else {
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- ideal_nops = intel_nops;
-#endif
- }
- break;
-
- case X86_VENDOR_HYGON:
- ideal_nops = p6_nops;
- return;
-
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 > 0xf) {
- ideal_nops = p6_nops;
- return;
- }
-
- fallthrough;
-
- default:
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- if (boot_cpu_has(X86_FEATURE_K8))
- ideal_nops = k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- ideal_nops = k7_nops;
- else
- ideal_nops = intel_nops;
-#endif
- }
-}
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
@@ -262,12 +108,14 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, ideal_nops[noplen], noplen);
+ memcpy(insns, x86_nops[noplen], noplen);
insns += noplen;
len -= noplen;
}
}
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
+extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -338,25 +186,69 @@ done:
}
/*
- * "noinline" to cause control flow change and thus invalidate I$ and
- * cause refetch after modification.
+ * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
+ *
+ * @instr: instruction byte stream
+ * @instrlen: length of the above
+ * @off: offset within @instr where the first NOP has been detected
+ *
+ * Return: number of NOPs found (and replaced).
*/
-static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
{
unsigned long flags;
- int i;
+ int i = off, nnops;
- for (i = 0; i < a->padlen; i++) {
+ while (i < instrlen) {
if (instr[i] != 0x90)
- return;
+ break;
+
+ i++;
}
+ nnops = i - off;
+
+ if (nnops <= 1)
+ return nnops;
+
local_irq_save(flags);
- add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+ add_nops(instr + off, nnops);
local_irq_restore(flags);
- DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
- instr, a->instrlen - a->padlen, a->padlen);
+ DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+
+ return nnops;
+}
+
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+{
+ struct insn insn;
+ int i = 0;
+
+ /*
+ * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
+ * ones.
+ */
+ for (;;) {
+ if (insn_decode_kernel(&insn, &instr[i]))
+ return;
+
+ /*
+ * See if this and any potentially following NOPs can be
+ * optimized.
+ */
+ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
+ i += optimize_nops_range(instr, len, i);
+ else
+ i += insn.length;
+
+ if (i >= len)
+ return;
+ }
}
/*
@@ -388,26 +280,32 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
+ /* Mask away "NOT" flag bit for feature to test. */
+ u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
- BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid)) {
- if (a->padlen > 1)
- optimize_nops(a, instr);
+ BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
- continue;
- }
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature not present but ALTINSTR_FLAG_INV is set to mean,
+ * patch if feature is *NOT* present.
+ */
+ if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+ goto next;
- DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
- a->cpuid >> 5,
- a->cpuid & 0x1f,
+ DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+ (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
+ feature >> 5,
+ feature & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen, a->padlen);
+ replacement, a->replacementlen);
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
@@ -428,17 +326,229 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
if (a->replacementlen && is_jmp(replacement[0]))
recompute_jump(a, instr, replacement, insn_buff);
- if (a->instrlen > a->replacementlen) {
- add_nops(insn_buff + a->replacementlen,
- a->instrlen - a->replacementlen);
- insn_buff_sz += a->instrlen - a->replacementlen;
- }
+ for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+ insn_buff[insn_buff_sz] = 0x90;
+
DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
text_poke_early(instr, insn_buff, insn_buff_sz);
+
+next:
+ optimize_nops(instr, a->instrlen);
+ }
+}
+
+#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+
+/*
+ * CALL/JMP *%\reg
+ */
+static int emit_indirect(int op, int reg, u8 *bytes)
+{
+ int i = 0;
+ u8 modrm;
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
}
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ return i;
}
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,lfence when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
+ return -1;
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+ ret = emit_indirect(op, reg, bytes + i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
+
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(bytes, len);
+ DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+
+#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+/*
+ * Generated by: objtool --ibt
+ */
+void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ u32 endbr, poison = gen_endbr_poison();
+ void *addr = (void *)s + *s;
+
+ if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ continue;
+
+ if (WARN_ON_ONCE(!is_endbr(endbr)))
+ continue;
+
+ DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+ }
+}
+
+#else
+
+void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
@@ -605,7 +715,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insn_buff, p->instr, p->len);
- used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
+ used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
@@ -640,34 +750,39 @@ asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_magic, @function\n"
"int3_magic:\n"
+ ANNOTATE_NOENDBR
" movl $1, (%" _ASM_ARG1 ")\n"
-" ret\n"
+ ASM_RET
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);
-extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
+extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
+ unsigned long selftest = (unsigned long)&int3_selftest_ip;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
+ OPTIMIZER_HIDE_VAR(selftest);
+
if (!regs || user_mode(regs))
return NOTIFY_DONE;
if (val != DIE_INT3)
return NOTIFY_DONE;
- if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
+ if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
int3_emulate_call(regs, (unsigned long)&int3_magic);
return NOTIFY_STOP;
}
-static void __init int3_selftest(void)
+/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
+static noinline void __init int3_selftest(void)
{
static __initdata struct notifier_block int3_exception_nb = {
.notifier_call = int3_exception_notify,
@@ -680,18 +795,12 @@ static void __init int3_selftest(void)
/*
* Basically: int3_magic(&val); but really complicated :-)
*
- * Stick the address of the INT3 instruction into int3_selftest_ip,
- * then trigger the INT3, padded with NOPs to match a CALL instruction
- * length.
+ * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
+ * notifier above will emulate CALL for us.
*/
- asm volatile ("1: int3; nop; nop; nop; nop\n\t"
- ".pushsection .init.data,\"aw\"\n\t"
- ".align " __ASM_SEL(4, 8) "\n\t"
- ".type int3_selftest_ip, @object\n\t"
- ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
- "int3_selftest_ip:\n\t"
- __ASM_SEL(.long, .quad) " 1b\n\t"
- ".popsection\n\t"
+ asm volatile ("int3_selftest_ip:\n\t"
+ ANNOTATE_NOENDBR
+ " int3; nop; nop; nop; nop\n\t"
: ASM_CALL_CONSTRAINT
: __ASM_SEL_RAW(a, D) (&val)
: "memory");
@@ -723,8 +832,43 @@ void __init alternative_instructions(void)
* patching.
*/
+ /*
+ * Paravirt patching and alternative patching can be combined to
+ * replace a function call with a short direct code sequence (e.g.
+ * by setting a constant return value instead of doing that in an
+ * external function).
+ * In order to make this work the following sequence is required:
+ * 1. set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching
+ * 2. apply paravirt patching (generally replacing an indirect
+ * function call with a direct one)
+ * 3. apply alternative patching (e.g. replacing a direct function
+ * call with a custom code sequence)
+ * Doing paravirt patching after alternative patching would clobber
+ * the optimization of the custom code with a function call again.
+ */
+ paravirt_set_cap();
+
+ /*
+ * First patch paravirt functions, such that we overwrite the indirect
+ * call with the direct call.
+ */
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ /*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+
+ /*
+ * Then patch alternatives, such that those paravirt calls that are in
+ * alternatives can be overwritten by their immediate fragments.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
+ apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
@@ -741,8 +885,6 @@ void __init alternative_instructions(void)
}
#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
-
restart_nmi();
alternatives_patched = 1;
}
@@ -813,7 +955,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* with a stale address space WITHOUT being in lazy mode after
* restoring the previous mm.
*/
- if (this_cpu_read(cpu_tlbstate.is_lazy))
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
leave_mm(smp_processor_id());
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -852,7 +994,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
-static void *__text_poke(void *addr, const void *opcode, size_t len)
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(const int *)src;
+
+ memset(dst, c, len);
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
@@ -917,7 +1073,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
prev = use_temporary_mm(poking_mm);
kasan_disable_current();
- memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+ func((u8 *)poking_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@@ -945,11 +1101,13 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
- /*
- * If the text does not match what we just wrote then something is
- * fundamentally screwy; there's nothing we can really do about that.
- */
- BUG_ON(memcmp(addr, opcode, len));
+ if (func == text_poke_memcpy) {
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, src, len));
+ }
local_irq_restore(flags);
pte_unmap_unlock(ptep, ptl);
@@ -976,7 +1134,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
{
lockdep_assert_held(&text_mutex);
- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
}
/**
@@ -995,7 +1153,72 @@ void *text_poke(void *addr, const void *opcode, size_t len)
*/
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
+
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing functions
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
+}
+
+/**
+ * text_poke_set - memset into (an unused part of) RX memory
+ * @addr: address to modify
+ * @c: the byte to fill the area with
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * This is useful to overwrite unused regions of RX memory with illegal
+ * instructions.
+ */
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
}
static void do_sync_core(void *info)
@@ -1009,10 +1232,13 @@ void text_poke_sync(void)
}
struct text_poke_loc {
- s32 rel_addr; /* addr := _stext + rel_addr */
- s32 rel32;
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ /* see text_poke_bp_batch() */
u8 old;
};
@@ -1027,7 +1253,8 @@ static struct bp_patching_desc *bp_desc;
static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
- struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
+ /* rcu_dereference */
+ struct bp_patching_desc *desc = __READ_ONCE(*descp);
if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
@@ -1061,7 +1288,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
- int len, ret = 0;
+ int ret = 0;
void *ip;
if (user_mode(regs))
@@ -1101,8 +1328,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
goto out_put;
}
- len = text_opcode_size(tp->opcode);
- ip += len;
+ ip += tp->len;
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1117,12 +1343,12 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->rel32);
+ int3_emulate_call(regs, (long)ip + tp->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ int3_emulate_jmp(regs, (long)ip + tp->disp);
break;
default:
@@ -1197,7 +1423,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
- int len = text_opcode_size(tp[i].opcode);
+ int len = tp[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
@@ -1274,21 +1500,37 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
+ int ret, i;
memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
- kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
- insn_get_length(&insn);
-
- BUG_ON(!insn_complete(&insn));
- BUG_ON(len != insn.length);
+ ret = insn_decode_kernel(&insn, emulate);
+ BUG_ON(ret < 0);
tp->rel_addr = addr - (void *)_stext;
+ tp->len = len;
tp->opcode = insn.opcode.bytes[0];
switch (tp->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ };
+
+
+ switch (tp->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -1296,21 +1538,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- tp->rel32 = insn.immediate.value;
+ tp->disp = insn.immediate.value;
break;
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
- BUG_ON(memcmp(emulate, ideal_nops[len], len));
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP8_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
- BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP32_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
default: /* unknown instruction */
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 9ac696487b13..194d54eed537 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -38,11 +38,9 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/set_memory.h>
-#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -331,7 +329,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int i;
if (iommu_start == -1)
- return -1;
+ return -ENOMEM;
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -380,13 +378,13 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
- int need = 0, nextneed, i, out, start;
+ int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
if (nents == 0)
- return 0;
+ return -EINVAL;
out = 0;
start = 0;
@@ -414,8 +412,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(dev, start_sg, i - start,
- sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
@@ -432,7 +431,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
- if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
flush_gart();
@@ -456,9 +456,7 @@ error:
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
- for_each_sg(sg, s, nents, i)
- s->dma_address = DMA_MAPPING_ERROR;
- return 0;
+ return ret;
}
/* allocate and map a coherent mapping */
@@ -808,7 +806,7 @@ int __init gart_iommu_init(void)
flush_gart();
dma_ops = &gart_dma_ops;
x86_platform.iommu_shutdown = gart_iommu_shutdown;
- swiotlb = 0;
+ x86_swiotlb_enable = false;
return 0;
}
@@ -842,4 +840,3 @@ void __init gart_parse_options(char *p)
}
}
}
-IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index b4396952c9a6..190e0f763375 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Shared support code for AMD K8 northbridges and derivates.
+ * Shared support code for AMD K8 northbridges and derivatives.
* Copyright 2006 Andi Kleen, SUSE Labs.
*/
@@ -19,14 +19,19 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
+#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
+#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
+#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
-/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
static u32 *flush_words;
@@ -36,6 +41,8 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{}
};
@@ -57,6 +64,9 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
};
@@ -72,6 +82,9 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
@@ -174,55 +187,8 @@ int amd_smn_write(u16 node, u32 address, u32 value)
}
EXPORT_SYMBOL_GPL(amd_smn_write);
-/*
- * Data Fabric Indirect Access uses FICAA/FICAD.
- *
- * Fabric Indirect Configuration Access Address (FICAA): Constructed based
- * on the device's Instance Id and the PCI function and register offset of
- * the desired register.
- *
- * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
- * and FICAD HI registers but so far we only need the LO register.
- */
-int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
-{
- struct pci_dev *F4;
- u32 ficaa;
- int err = -ENODEV;
-
- if (node >= amd_northbridges.num)
- goto out;
-
- F4 = node_to_amd_nb(node)->link;
- if (!F4)
- goto out;
-
- ficaa = 1;
- ficaa |= reg & 0x3FC;
- ficaa |= (func & 0x7) << 11;
- ficaa |= instance_id << 16;
-
- mutex_lock(&smn_mutex);
-
- err = pci_write_config_dword(F4, 0x5C, ficaa);
- if (err) {
- pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
- goto out_unlock;
- }
-
- err = pci_read_config_dword(F4, 0x98, lo);
- if (err)
- pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
-
-out_unlock:
- mutex_unlock(&smn_mutex);
-
-out:
- return err;
-}
-EXPORT_SYMBOL_GPL(amd_df_indirect_read);
-int amd_cache_northbridges(void)
+static int amd_cache_northbridges(void)
{
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *link_ids = amd_nb_link_ids;
@@ -244,14 +210,14 @@ int amd_cache_northbridges(void)
}
misc = NULL;
- while ((misc = next_northbridge(misc, misc_ids)) != NULL)
+ while ((misc = next_northbridge(misc, misc_ids)))
misc_count++;
if (!misc_count)
return -ENODEV;
root = NULL;
- while ((root = next_northbridge(root, root_ids)) != NULL)
+ while ((root = next_northbridge(root, root_ids)))
root_count++;
if (root_count) {
@@ -324,7 +290,6 @@ int amd_cache_northbridges(void)
return 0;
}
-EXPORT_SYMBOL_GPL(amd_cache_northbridges);
/*
* Ignores subdevice/subvendor but as far as I can figure out
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 294ed4392a0e..7a5630d904b2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn)
(pfn >= aperture_pfn_start + aperture_page_count));
}
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+ return !!gart_mem_pfn_is_ram(pfn);
+}
+
+static struct vmcore_cb gart_vmcore_cb = {
+ .pfn_is_ram = gart_oldmem_pfn_is_ram,
+};
+#endif
+
static void __init exclude_from_core(u64 aper_base, u32 aper_order)
{
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
#ifdef CONFIG_PROC_VMCORE
- WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+ register_vmcore_cb(&gart_vmcore_cb);
#endif
#ifdef CONFIG_PROC_KCORE
WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
@@ -109,14 +120,13 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
- aper_size, aper_size);
+ addr = memblock_phys_alloc_range(aper_size, aper_size,
+ GART_MIN_ADDR, GART_MAX_ADDR);
if (!addr) {
pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
- memblock_reserve(addr, aper_size);
pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
@@ -382,7 +392,7 @@ void __init early_gart_iommu_check(void)
static int __initdata printed_gart_size_msg;
-int __init gart_iommu_hole_init(void)
+void __init gart_iommu_hole_init(void)
{
u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -391,11 +401,11 @@ int __init gart_iommu_hole_init(void)
int i, node;
if (!amd_gart_present())
- return -ENODEV;
+ return;
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
- return -ENODEV;
+ return;
pr_info("Checking aperture...\n");
@@ -481,10 +491,8 @@ out:
* and fixed up the northbridge
*/
exclude_from_core(last_aper_base, last_aper_order);
-
- return 1;
}
- return 0;
+ return;
}
if (!fallback_aper_force) {
@@ -517,7 +525,7 @@ out:
panic("Not enough memory for aperture");
}
} else {
- return 0;
+ return;
}
/*
@@ -551,6 +559,4 @@ out:
}
set_up_gart_resume(aper_order, aper_alloc);
-
- return 1;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4f26700f314d..189d3a5e471a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -38,6 +38,7 @@
#include <asm/trace/irq_vectors.h>
#include <asm/irq_remapping.h>
+#include <asm/pc-conf-reg.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <linux/atomic.h>
@@ -132,18 +133,14 @@ static int enabled_via_apicbase __ro_after_init;
*/
static inline void imcr_pic_to_apic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go through APIC */
- outb(0x01, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x01);
}
static inline void imcr_apic_to_pic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go directly to BSP */
- outb(0x00, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x00);
}
#endif
@@ -173,7 +170,7 @@ static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
@@ -323,6 +320,9 @@ int lapic_get_maxlvt(void)
#define APIC_DIVISOR 16
#define TSC_DIVISOR 8
+/* i82489DX specific */
+#define I82489DX_BASE_DIVIDER (((0x2) << 18))
+
/*
* This function sets up the local APIC timer, with a timeout of
* 'clocks' APIC bus clock. During calibration we actually call
@@ -343,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+ /*
+ * The i82489DX APIC uses bit 18 and 19 for the base divider. This
+ * overlaps with bit 18 on integrated APICs, but is not documented
+ * in the SDM. No problem though. i82489DX equipped systems do not
+ * have TSC deadline timer.
+ */
if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ lvtt_value |= I82489DX_BASE_DIVIDER;
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
@@ -619,7 +625,7 @@ static void setup_APIC_timer(void)
if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
- /* Make LAPIC timer preferrable over percpu HPET */
+ /* Make LAPIC timer preferable over percpu HPET */
lapic_clockevent.rating = 150;
}
@@ -666,7 +672,7 @@ void lapic_update_tsc_freq(void)
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
* frequency.
*
* This was previously done by reading the PIT/HPET and waiting for a wrap
@@ -1422,22 +1428,21 @@ void __init apic_intr_mode_init(void)
return;
case APIC_VIRTUAL_WIRE:
pr_info("APIC: Switch to virtual wire mode setup\n");
- default_setup_apic_routing();
break;
case APIC_VIRTUAL_WIRE_NO_CONFIG:
pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
upmode = true;
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO:
pr_info("APIC: Switch to symmetric I/O mode setup\n");
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO_NO_ROUTING:
pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
break;
}
+ default_setup_apic_routing();
+
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
@@ -1532,7 +1537,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
* Most probably by now the CPU has serviced that pending interrupt and it
* might not have done the ack_APIC_irq() because it thought, interrupt
* came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
- * the ISR bit and cpu thinks it has already serivced the interrupt. Hence
+ * the ISR bit and cpu thinks it has already serviced the interrupt. Hence
* a vector might get locked. It was noticed for timer irq (vector
* 0x31). Issue an extra EOI to clear ISR.
*
@@ -1657,7 +1662,7 @@ static void setup_local_APIC(void)
*/
/*
* Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
+ * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
*/
@@ -1875,7 +1880,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
/*
* Without IR, all CPUs can be addressed by IOAPIC/MSI only
- * in physical mode, and CPUs with an APIC ID that cannnot
+ * in physical mode, and CPUs with an APIC ID that cannot
* be addressed must not be brought online.
*/
x2apic_set_max_apicid(apic_limit);
@@ -2554,6 +2559,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
}
EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+#ifdef CONFIG_X86_64
+void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
+ (*drv)->wakeup_secondary_cpu_64 = handler;
+}
+#endif
+
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
@@ -2604,6 +2619,7 @@ static void __init apic_bsp_setup(bool upmode)
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
#ifdef CONFIG_UP_LATE_INIT
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ff4dd426a8..a868b76cd3d4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -65,6 +65,7 @@
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/pgtable.h>
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
@@ -764,7 +765,7 @@ static bool irq_active_low(int idx)
static bool EISA_ELCR(unsigned int irq)
{
if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
+ unsigned int port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
apic_printk(APIC_VERBOSE, KERN_INFO
@@ -928,7 +929,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
/*
* setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
- * and polarity attirbutes. So allow the first user to reprogram the
+ * and polarity attributes. So allow the first user to reprogram the
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
@@ -994,7 +995,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
/*
* Legacy ISA IRQ has already been allocated, just add pin to
- * the pin list assoicated with this IRQ and program the IOAPIC
+ * the pin list associated with this IRQ and program the IOAPIC
* entry. The IOAPIC entry
*/
if (irq_data && irq_data->parent_data) {
@@ -1752,7 +1753,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
* with masking the ioapic entry and then polling until
* Remote IRR was clear before reprogramming the
* ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
+ * completely accurate.
*
* However there appears to be no other way to plug
* this race, so if the Remote IRR bit is not
@@ -1830,7 +1831,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
* message via io-apic EOI register write or simulating it using
- * mask+edge followed by unnask+level logic) manually when the
+ * mask+edge followed by unmask+level logic) manually when the
* level triggered interrupt is seen as the edge triggered interrupt
* at the cpu.
*/
@@ -1986,7 +1987,8 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static struct irq_chip ioapic_ir_chip __read_mostly = {
@@ -1999,7 +2001,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static inline void init_IO_APIC_traps(void)
@@ -2675,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void)
return res;
}
+static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
+{
+ pgprot_t flags = FIXMAP_PAGE_NOCACHE;
+
+ /*
+ * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot
+ * bits, just like normal ioremap():
+ */
+ flags = pgprot_decrypted(flags);
+
+ __set_fixmap(idx, phys, flags);
+}
+
void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
@@ -2707,7 +2723,7 @@ fake_ioapic_page:
__func__, PAGE_SIZE, PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
- set_fixmap_nocache(idx, ioapic_phys);
+ io_apic_set_fixmap(idx, ioapic_phys);
apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
ioapic_phys);
@@ -2836,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address);
if (bad_ioapic_register(idx)) {
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENODEV;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 44ebe25e7703..7517eb05bdc1 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -19,6 +19,7 @@
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
@@ -58,11 +59,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
* The quirk bit is not set in this case.
* - The new vector is the same as the old vector
* - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
* - The new destination CPU is the same as the old destination CPU
*/
if (!irqd_msi_nomask_quirk(irqd) ||
cfg->vector == old_cfg.vector ||
old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ !irqd_is_started(irqd) ||
cfg->dest_apicid == old_cfg.dest_apicid) {
irq_msi_update_msg(irqd, cfg);
return ret;
@@ -150,17 +153,15 @@ static struct irq_chip pci_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_set_affinity = msi_set_affinity,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct msi_desc *desc = first_pci_msi_entry(pdev);
-
init_irq_alloc_info(arg, NULL);
- if (desc->msi_attrib.is_msix) {
+ if (to_pci_dev(dev)->msix_enabled) {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
@@ -219,7 +220,8 @@ static struct irq_chip pci_msi_ir_controller = {
.irq_mask = pci_msi_mask_irq,
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static struct msi_domain_info pci_msi_ir_domain_info = {
@@ -273,7 +275,8 @@ static struct irq_chip dmar_msi_controller = {
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static int dmar_msi_init(struct irq_domain *domain,
@@ -340,3 +343,8 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
}
#endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ return xen_initdom_restore_msi(dev);
+}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3c9c7492252f..3e6f6b448f6a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -543,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
return -ENOSYS;
+ /*
+ * Catch any attempt to touch the cascade interrupt on a PIC
+ * equipped system.
+ */
+ if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY &&
+ virq == PIC_CASCADE_IR))
+ return -EINVAL;
+
for (i = 0; i < nr_irqs; i++) {
irqd = irq_domain_get_irq_data(domain, virq + i);
BUG_ON(!irqd);
@@ -730,11 +738,31 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace)
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
}
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IRQ has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
void __init lapic_assign_system_vectors(void)
{
- unsigned int i, vector = 0;
+ unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1)
@@ -745,6 +773,11 @@ void __init lapic_assign_system_vectors(void)
/* Mark the preallocated legacy interrupts */
for (i = 0; i < nr_legacy_irqs(); i++) {
+ /*
+ * Don't touch the cascade interrupt. It's unusable
+ * on PIC equipped machines. See the large comment
+ * in the IO/APIC code.
+ */
if (i != PIC_CASCADE_IR)
irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
}
@@ -1045,7 +1078,7 @@ void irq_force_complete_move(struct irq_desc *desc)
*
* But in case of cpu hotplug this should be a non issue
* because if the affinity update happens right before all
- * cpus rendevouz in stop machine, there is no way that the
+ * cpus rendezvous in stop machine, there is no way that the
* interrupt can be blocked on the target cpu because all cpus
* loops first with interrupts enabled in stop machine, so the
* old vector is not yet cleaned up when the interrupt fires.
@@ -1054,7 +1087,7 @@ void irq_force_complete_move(struct irq_desc *desc)
* of the interrupt on the apic/system bus would be delayed
* beyond the point where the target cpu disables interrupts
* in stop machine. I doubt that it can happen, but at least
- * there is a theroretical chance. Virtualization might be
+ * there is a theoretical chance. Virtualization might be
* able to expose this, but AFAICT the IOAPIC emulation is not
* as stupid as the real hardware.
*
@@ -1266,7 +1299,7 @@ static void __init print_PIC(void)
pr_debug("... PIC ISR: %04x\n", v);
- v = inb(0x4d1) << 8 | inb(0x4d0);
+ v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1);
pr_debug("... PIC ELCR: %04x\n", v);
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a88..e696e22d0531 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
struct cpumask mask;
};
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}
static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;
- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
if (cmsk)
goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 52bc217ca8c3..482855227964 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void)
int mmr_shift;
char *state;
- /* Different returns from different UV BIOS versions */
+ /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */
+ if (!is_uv(UV2|UV3|UV4)) {
+ mark_tsc_async_resets("UV5+");
+ return;
+ }
+
+ /* UV2,3,4, UV BIOS TSC sync state available */
mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
mmr_shift =
is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
@@ -369,6 +375,15 @@ static int __init early_get_arch_type(void)
return ret;
}
+/* UV system found, check which APIC MODE BIOS already selected */
+static void __init early_set_apic_mode(void)
+{
+ if (x2apic_enabled())
+ uv_system_type = UV_X2APIC;
+ else
+ uv_system_type = UV_LEGACY_APIC;
+}
+
static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
{
/* Save OEM_ID passed from ACPI MADT */
@@ -404,11 +419,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
else
uv_hubless_system |= 0x8;
- /* Copy APIC type */
+ /* Copy OEM Table ID */
uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
oem_id, oem_table_id, uv_system_type, uv_hubless_system);
+
return 0;
}
@@ -453,6 +469,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
early_set_hub_type();
/* Other UV setup functions */
+ early_set_apic_mode();
early_get_pnodeid();
early_get_apic_socketid_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -472,29 +489,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
return 0;
- /* Save and Decode OEM Table ID */
+ /* Save for display of the OEM Table ID */
uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
- /* This is the most common hardware variant, x2apic mode */
- if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
-
- /* Only used for very small systems, usually 1 chassis, legacy mode */
- else if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
-
- else
- goto badbios;
-
pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n",
oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY),
uv_min_hub_revision_id);
return 0;
-
-badbios:
- pr_err("UV: UVarchtype:%s not supported\n", uv_archtype);
- BUG();
}
enum uv_system_type get_uv_system_type(void)
@@ -1344,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr)
static void __init decode_gam_rng_tbl(unsigned long ptr)
{
struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
- unsigned long lgre = 0;
+ unsigned long lgre = 0, gend = 0;
int index = 0;
int sock_min = 999999, pnode_min = 99999;
int sock_max = -1, pnode_max = -1;
@@ -1378,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
flag, size, suffix[order],
gre->type, gre->nasid, gre->sockid, gre->pnode);
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT;
+
/* update to next range start */
lgre = gre->limit;
if (sock_min > gre->sockid)
@@ -1395,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
_max_pnode = pnode_max;
_gr_table_len = index;
- pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n",
+ index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend));
}
/* Walk through UVsystab decoding the fields */
@@ -1671,6 +1677,9 @@ static __init int uv_system_init_hubless(void)
if (rc < 0)
return rc;
+ /* Set section block size for current node memory */
+ set_block_size();
+
/* Create user access node */
if (rc >= 0)
uv_setup_proc_files(1);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 660270359d39..60e330cdbd17 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -94,7 +94,7 @@
* Remove APM dependencies in arch/i386/kernel/process.c
* Remove APM dependencies in drivers/char/sysrq.c
* Reset time across standby.
- * Allow more inititialisation on SMP.
+ * Allow more initialisation on SMP.
* Remove CONFIG_APM_POWER_OFF and make it boot time
* configurable (default on).
* Make debug only a boot time parameter (remove APM_DEBUG).
@@ -232,6 +232,7 @@
#include <asm/paravirt.h>
#include <asm/reboot.h>
#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
@@ -598,6 +599,7 @@ static long __apm_bios_call(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
@@ -607,11 +609,13 @@ static long __apm_bios_call(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
+ ibt = ibt_save();
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
&call->esi);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
@@ -676,6 +680,7 @@ static long __apm_bios_call_simple(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
@@ -685,10 +690,12 @@ static long __apm_bios_call_simple(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
+ ibt = ibt_save();
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
@@ -766,7 +773,7 @@ static int apm_driver_version(u_short *val)
* not cleared until it is acknowledged.
*
* Additional information is returned in the info pointer, providing
- * that APM 1.2 is in use. If no messges are pending the value 0x80
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
* is returned (No power management events pending).
*/
static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
@@ -1025,7 +1032,7 @@ static int apm_enable_power_management(int enable)
* status which gives the rough battery status, and current power
* source. The bat value returned give an estimate as a percentage
* of life and a status value for the battery. The estimated life
- * if reported is a lifetime in secodnds/minutes at current powwer
+ * if reported is a lifetime in seconds/minutes at current power
* consumption.
*/
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..437308004ef2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -18,6 +18,7 @@
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>
+#include <asm/tdx.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -38,9 +39,6 @@ static void __used common(void)
#endif
BLANK();
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
- BLANK();
OFFSET(pbe_address, pbe, address);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
@@ -61,13 +59,6 @@ static void __used common(void)
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
#endif
-#ifdef CONFIG_PARAVIRT_XXL
- BLANK();
- OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
- OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
- OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
-#endif
-
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
@@ -76,6 +67,22 @@ static void __used common(void)
#endif
BLANK();
+ OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_output, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_output, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_output, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
+
+ BLANK();
+ OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
+ OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
+ OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
+ OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
+ OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
+ OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
+
+ BLANK();
OFFSET(BP_scratch, boot_params, scratch);
OFFSET(BP_secure_boot, boot_params, secure_boot);
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..2b411cd00a4e 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -53,11 +53,6 @@ void foo(void)
offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-#ifdef CONFIG_STACKPROTECTOR
- BLANK();
- OFFSET(stack_canary_offset, stack_canary, canary);
-#endif
-
BLANK();
DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map));
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b14533af7676..9b698215d261 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,7 +5,7 @@
#include <asm/ia32.h>
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
#endif
@@ -20,7 +20,7 @@ int main(void)
BLANK();
#endif
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 83d9cad4e68b..44c3601cfdc4 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -47,14 +47,16 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_execve:
case __NR_execveat:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 637b499450d1..9661e3e802be 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 347a956f71ca..0c0b09796ced 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -394,35 +394,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
-static void amd_detect_ppin(struct cpuinfo_x86 *c)
-{
- unsigned long long val;
-
- if (!cpu_has(c, X86_FEATURE_AMD_PPIN))
- return;
-
- /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */
- if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val))
- goto clear_ppin;
-
- /* PPIN is locked in disabled mode, clear feature bit */
- if ((val & 3UL) == 1UL)
- goto clear_ppin;
-
- /* If PPIN is disabled, try to enable it */
- if (!(val & 2UL)) {
- wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL);
- rdmsrl_safe(MSR_AMD_PPIN_CTL, &val);
- }
-
- /* If PPIN_EN bit is 1, return from here; otherwise fall through */
- if (val & 2UL)
- return;
-
-clear_ppin:
- clear_cpu_cap(c, X86_FEATURE_AMD_PPIN);
-}
-
u32 amd_get_nodes_per_socket(void)
{
return nodes_per_socket;
@@ -438,7 +409,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = get_llc_id(cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -585,6 +556,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* the SME physical address space reduction value.
* If BIOS has not enabled SME then don't advertise the
* SME feature (set in scattered.c).
+ * If the kernel has not enabled SME via any means then
+ * don't advertise the SME feature.
* For SEV: If BIOS has not enabled SEV then don't advertise the
* SEV and SEV_ES feature (set in scattered.c).
*
@@ -593,8 +566,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
*/
if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
/* Check if memory encryption is enabled */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
goto clear_all;
/*
@@ -607,6 +580,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
if (IS_ENABLED(CONFIG_X86_32))
goto clear_all;
+ if (!sme_me_mask)
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+
rdmsrl(MSR_K7_HWCR, msr);
if (!(msr & MSR_K7_HWCR_SMMLOCK))
goto clear_sev;
@@ -628,11 +604,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
early_init_amd_mc(c);
-#ifdef CONFIG_X86_32
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_K7);
-#endif
-
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
@@ -651,6 +622,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
@@ -948,7 +923,6 @@ static void init_amd(struct cpuinfo_x86 *c)
amd_detect_cmp(c);
amd_get_topology(c);
srat_detect_node(c);
- amd_detect_ppin(c);
init_amd_cacheinfo(c);
@@ -990,6 +964,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
@@ -1170,3 +1146,19 @@ void set_dr_addr_mask(unsigned long mask, int dr)
break;
}
}
+
+u32 amd_get_highest_perf(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+ return 166;
+
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+ return 166;
+
+ return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 22911deacb6e..1f60a2b27936 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -6,146 +6,446 @@
* Copyright (C) 2017 Intel Corp.
* Author: Len Brown <len.brown@intel.com>
*/
-
+#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
-#include <linux/cpufreq.h>
-#include <linux/smp.h>
-#include <linux/sched/isolation.h>
#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "cpu.h"
-struct aperfmperf_sample {
- unsigned int khz;
- atomic_t scfpending;
- ktime_t time;
- u64 aperf;
- u64 mperf;
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
};
-static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
-#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 10
-#define APERFMPERF_STALE_THRESHOLD_MS 1000
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * aperfmperf_snapshot_khz()
- * On the current CPU, snapshot APERF, MPERF, and jiffies
- * unless we already did it within 10ms
- * calculate kHz, save snapshot
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
-static void aperfmperf_snapshot_khz(void *dummy)
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
{
- u64 aperf, aperf_delta;
- u64 mperf, mperf_delta;
- struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- unsigned long flags;
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
- local_irq_save(flags);
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
- local_irq_restore(flags);
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(model) \
+ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(XEON_PHI_KNL),
+ X86_MATCH(XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(ATOM_GOLDMONT),
+ X86_MATCH(ATOM_GOLDMONT_D),
+ X86_MATCH(ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
- aperf_delta = aperf - s->aperf;
- mperf_delta = mperf - s->mperf;
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
/*
- * There is no architectural guarantee that MPERF
- * increments faster than we can read it.
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (mperf_delta == 0)
- return;
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
- s->time = ktime_get();
- s->aperf = aperf;
- s->mperf = mperf;
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- atomic_set_release(&s->scfpending, 0);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
}
-static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
{
- s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
- /* Don't bother re-computing within the cache threshold time. */
- if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return true;
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (!atomic_xchg(&s->scfpending, 1) || wait)
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (intel_set_max_freq_ratio())
+ freq_invariance_enable();
+}
- /* Return false if the previous iteration was too long ago. */
- return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
}
-unsigned int aperfmperf_get_khz(int cpu)
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- if (!cpu_khz)
- return 0;
+ u64 freq_scale;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ if (!arch_scale_freq_invariant())
+ return;
- if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
- return 0;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
-void arch_freq_prepare_all(void)
+void arch_scale_freq_tick(void)
{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
- if (!cpu_khz)
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
+ scale_freq_tick(acnt, mcnt);
}
+/*
+ * Discard samples older than the define maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
unsigned int arch_freq_get_on_cpu(int cpu)
{
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
- if (!cpu_khz)
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
- if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
- return 0;
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
- if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
- return per_cpu(samples.khz, cpu);
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
- msleep(APERFMPERF_REFRESH_DELAY_MS);
- atomic_set(&s->scfpending, 1);
- smp_mb(); /* ->scfpending before smp_call_function_single(). */
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ init_counter_refs();
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
- return per_cpu(samples.khz, cpu);
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs();
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d41b70fe4918..74c62cc47a5f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,13 +16,14 @@
#include <linux/prctl.h>
#include <linux/sched/smt.h>
#include <linux/pgtable.h>
+#include <linux/bpf.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
@@ -40,9 +41,12 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init mds_print_mitigation(void);
+static void __init md_clear_update_mitigation(void);
+static void __init md_clear_select_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
+static void __init l1d_flush_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -76,6 +80,17 @@ EXPORT_SYMBOL_GPL(mds_user_clear);
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
+/*
+ * Controls whether l1d flush based mitigations are enabled,
+ * based on hw features and admin setting via boot parameter
+ * defaults to false
+ */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
+DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -108,15 +123,9 @@ void __init check_bugs(void)
spectre_v2_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
- mds_select_mitigation();
- taa_select_mitigation();
+ md_clear_select_mitigation();
srbds_select_mitigation();
-
- /*
- * As MDS and TAA mitigations are inter-related, print MDS
- * mitigation until after TAA mitigation selection is done.
- */
- mds_print_mitigation();
+ l1d_flush_select_mitigation();
arch_smt_update();
@@ -257,14 +266,6 @@ static void __init mds_select_mitigation(void)
}
}
-static void __init mds_print_mitigation(void)
-{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
- return;
-
- pr_info("%s\n", mds_strings[mds_mitigation]);
-}
-
static int __init mds_cmdline(char *str)
{
if (!boot_cpu_has_bug(X86_BUG_MDS))
@@ -319,7 +320,7 @@ static void __init taa_select_mitigation(void)
/* TSX previously disabled by tsx=off */
if (!boot_cpu_has(X86_FEATURE_RTM)) {
taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
- goto out;
+ return;
}
if (cpu_mitigations_off()) {
@@ -333,7 +334,7 @@ static void __init taa_select_mitigation(void)
*/
if (taa_mitigation == TAA_MITIGATION_OFF &&
mds_mitigation == MDS_MITIGATION_OFF)
- goto out;
+ return;
if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
taa_mitigation = TAA_MITIGATION_VERW;
@@ -365,18 +366,6 @@ static void __init taa_select_mitigation(void)
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
-
- /*
- * Update MDS mitigation, if necessary, as the mds_user_clear is
- * now enabled for TAA mitigation.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
-out:
- pr_info("%s\n", taa_strings[taa_mitigation]);
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -401,6 +390,151 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
+ cpu_mitigations_off()) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
+ boot_cpu_has(X86_FEATURE_RTM)))
+ static_branch_enable(&mds_user_clear);
+ else
+ static_branch_enable(&mmio_stale_data_clear);
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&mds_idle_clear);
+
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+
+ if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static void __init md_clear_update_mitigation(void)
+{
+ if (cpu_mitigations_off())
+ return;
+
+ if (!static_key_enabled(&mds_user_clear))
+ goto out;
+
+ /*
+ * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
+ * mitigation, if necessary.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_select_mitigation();
+ }
+ if (mmio_mitigation == MMIO_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_select_mitigation();
+ }
+out:
+ if (boot_cpu_has_bug(X86_BUG_MDS))
+ pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init md_clear_select_mitigation(void)
+{
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+
+ /*
+ * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
+ * and print their mitigation after MDS, TAA and MMIO Stale Data
+ * mitigation selection is done.
+ */
+ md_clear_update_mitigation();
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
@@ -436,6 +570,13 @@ void update_srbds_msr(void)
if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
return;
+ /*
+ * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX
+ * being disabled and it hasn't received the SRBDS MSR microcode.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
@@ -461,11 +602,13 @@ static void __init srbds_select_mitigation(void)
return;
/*
- * Check to see if this is one of the MDS_NO systems supporting
- * TSX that are only exposed to SRBDS when TSX is enabled.
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
*/
ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
@@ -492,6 +635,34 @@ static int __init srbds_parse_cmdline(char *str)
early_param("srbds", srbds_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "L1D Flush : " fmt
+
+enum l1d_flush_mitigations {
+ L1D_FLUSH_OFF = 0,
+ L1D_FLUSH_ON,
+};
+
+static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF;
+
+static void __init l1d_flush_select_mitigation(void)
+{
+ if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ return;
+
+ static_branch_enable(&switch_mm_cond_l1d_flush);
+ pr_info("Conditional flush on switch_mm() enabled\n");
+}
+
+static int __init l1d_flush_parse_cmdline(char *str)
+{
+ if (!strcmp(str, "on"))
+ l1d_flush_mitigation = L1D_FLUSH_ON;
+
+ return 0;
+}
+early_param("l1d_flush", l1d_flush_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -613,6 +784,32 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
@@ -627,7 +824,10 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
};
enum spectre_v2_user_cmd {
@@ -700,6 +900,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
return SPECTRE_V2_USER_CMD_AUTO;
}
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return (mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE);
+}
+
static void __init
spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
@@ -721,11 +928,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
case SPECTRE_V2_USER_CMD_FORCE:
mode = SPECTRE_V2_USER_STRICT;
break;
+ case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
mode = SPECTRE_V2_USER_PRCTL;
break;
- case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_SECCOMP:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
if (IS_ENABLED(CONFIG_SECCOMP))
@@ -767,7 +974,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
!smt_possible ||
- spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return;
/*
@@ -787,9 +994,11 @@ set_mode:
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
};
static const struct {
@@ -800,8 +1009,12 @@ static const struct {
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
+ { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
+ { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
};
@@ -838,17 +1051,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+ cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!IS_ENABLED(CONFIG_RETPOLINE)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_EIBRS ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -857,6 +1083,16 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -877,49 +1113,64 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_IBRS_ENHANCED;
- /* Force it so VMEXIT will restore correctly */
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- goto specv2_set_mode;
+ mode = SPECTRE_V2_EIBRS;
+ break;
}
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+
+ mode = spectre_v2_select_retpoline();
break;
- case SPECTRE_V2_CMD_RETPOLINE_AMD:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_amd;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ mode = SPECTRE_V2_LFENCE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_generic;
+ mode = SPECTRE_V2_RETPOLINE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ mode = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ mode = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ mode = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
- pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
- return;
-retpoline_auto:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- retpoline_amd:
- if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
- goto retpoline_generic;
- }
- mode = SPECTRE_V2_RETPOLINE_AMD;
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
- } else {
- retpoline_generic:
- mode = SPECTRE_V2_RETPOLINE_GENERIC;
+ if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_eibrs_mode(mode)) {
+ /* Force it so VMEXIT will restore correctly */
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
}
-specv2_set_mode:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -945,7 +1196,7 @@ specv2_set_mode:
* the CPU supports Enhanced IBRS, kernel might un-intentionally not
* enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
+ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
@@ -991,6 +1242,8 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1002,19 +1255,26 @@ static void update_mds_branch_idle(void)
if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
return;
- if (sched_smt_active())
+ if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
- else
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (ia32_cap & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
+ }
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
void cpu_bugs_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
@@ -1050,6 +1310,16 @@ void cpu_bugs_smt_update(void)
break;
}
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1132,7 +1402,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
return mode;
switch (cmd) {
- case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_SECCOMP:
/*
* Choose prctl+seccomp as the default mode if seccomp is
@@ -1146,6 +1415,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
case SPEC_STORE_BYPASS_CMD_ON:
mode = SPEC_STORE_BYPASS_DISABLE;
break;
+ case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
mode = SPEC_STORE_BYPASS_PRCTL;
break;
@@ -1215,6 +1485,24 @@ static void task_update_spec_tif(struct task_struct *tsk)
speculation_ctrl_update_current();
}
+static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return -EPERM;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ case PR_SPEC_DISABLE:
+ clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ default:
+ return -ERANGE;
+ }
+}
+
static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
{
if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
@@ -1324,6 +1612,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
return ssb_prctl_set(task, ctrl);
case PR_SPEC_INDIRECT_BRANCH:
return ib_prctl_set(task, ctrl);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_set(task, ctrl);
default:
return -ENODEV;
}
@@ -1340,6 +1630,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
}
#endif
+static int l1d_flush_prctl_get(struct task_struct *task)
+{
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return PR_SPEC_FORCE_DISABLE;
+
+ if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH))
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ else
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+}
+
static int ssb_prctl_get(struct task_struct *task)
{
switch (ssb_mode) {
@@ -1390,6 +1691,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
return ssb_prctl_get(task);
case PR_SPEC_INDIRECT_BRANCH:
return ib_prctl_get(task);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_get(task);
default:
return -ENODEV;
}
@@ -1619,9 +1922,23 @@ static ssize_t tsx_async_abort_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t mmio_stale_data_show_state(char *buf)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -1651,6 +1968,27 @@ static char *ibpb_state(void)
return "";
}
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+ return sprintf(buf, "Vulnerable: LFENCE\n");
+
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sprintf(buf, "%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ spectre_v2_module_string());
+}
+
static ssize_t srbds_show_state(char *buf)
{
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
@@ -1676,12 +2014,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- spectre_v2_module_string());
+ return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
@@ -1703,6 +2036,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SRBDS:
return srbds_show_state(buf);
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
default:
break;
}
@@ -1754,4 +2090,9 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
#endif
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 3ca9be482a9e..fe98a1465be6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
l2 = new_l2;
#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
+ per_cpu(cpu_l2c_id, cpu) = l2_id;
#endif
}
@@ -877,7 +878,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct _cpuid4_info_regs *base)
{
- struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpu_cacheinfo *this_cpu_ci;
struct cacheinfo *this_leaf;
int i, sibling;
@@ -985,7 +986,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf,
this_leaf->priv = base->nb;
}
-static int __init_cache_level(unsigned int cpu)
+int init_cache_level(unsigned int cpu)
{
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -1014,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
id4_regs->id = c->apicid >> index_msb;
}
-static int __populate_cache_leaves(unsigned int cpu)
+int populate_cache_leaves(unsigned int cpu)
{
unsigned int idx, ret;
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -1033,6 +1034,3 @@ static int __populate_cache_leaves(unsigned int cpu)
return 0;
}
-
-DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
-DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ab640abe26b6..4730b0a58f24 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -42,7 +42,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
#include <asm/hwcap2.h>
#include <linux/numa.h>
@@ -58,6 +58,9 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
+#include <asm/sigframe.h>
+#include <asm/traps.h>
+#include <asm/sev.h>
#include "cpu.h"
@@ -78,6 +81,92 @@ EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+u16 get_llc_id(unsigned int cpu)
+{
+ return per_cpu(cpu_llc_id, cpu);
+}
+EXPORT_SYMBOL_GPL(get_llc_id);
+
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
+static struct ppin_info {
+ int feature;
+ int msr_ppin_ctl;
+ int msr_ppin;
+} ppin_info[] = {
+ [X86_VENDOR_INTEL] = {
+ .feature = X86_FEATURE_INTEL_PPIN,
+ .msr_ppin_ctl = MSR_PPIN_CTL,
+ .msr_ppin = MSR_PPIN
+ },
+ [X86_VENDOR_AMD] = {
+ .feature = X86_FEATURE_AMD_PPIN,
+ .msr_ppin_ctl = MSR_AMD_PPIN_CTL,
+ .msr_ppin = MSR_AMD_PPIN
+ },
+};
+
+static const struct x86_cpu_id ppin_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]),
+ X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
+
+ /* Legacy models without CPUID enumeration */
+ X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+
+ {}
+};
+
+static void ppin_init(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *id;
+ unsigned long long val;
+ struct ppin_info *info;
+
+ id = x86_match_cpu(ppin_cpuids);
+ if (!id)
+ return;
+
+ /*
+ * Testing the presence of the MSR is not enough. Need to check
+ * that the PPIN_CTL allows reading of the PPIN.
+ */
+ info = (struct ppin_info *)id->driver_data;
+
+ if (rdmsrl_safe(info->msr_ppin_ctl, &val))
+ goto clear_ppin;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN locked in disabled mode */
+ goto clear_ppin;
+ }
+
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
+ wrmsrl_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrl_safe(info->msr_ppin_ctl, &val);
+ }
+
+ /* Is the enable bit set? */
+ if (val & 2UL) {
+ c->ppin = __rdmsr(info->msr_ppin);
+ set_cpu_cap(c, info->feature);
+ return;
+ }
+
+clear_ppin:
+ clear_cpu_cap(c, info->feature);
+}
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -161,7 +250,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
[GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- GDT_STACK_CANARY_INIT
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -211,13 +299,6 @@ static int __init cachesize_setup(char *str)
}
__setup("cachesize=", cachesize_setup);
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
-}
-__setup("nosep", x86_sep_setup);
-
/* Standard macro to see if a specific flag is changeable */
static inline int flag_is_changeable_p(u32 flag)
{
@@ -289,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
-static __init int setup_disable_smep(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
- return 1;
-}
-__setup("nosmep", setup_disable_smep);
-
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
cr4_set_bits(X86_CR4_SMEP);
}
-static __init int setup_disable_smap(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMAP);
- return 1;
-}
-__setup("nosmap", setup_disable_smap);
-
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
unsigned long eflags = native_save_fl();
@@ -316,13 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
/* This should have been cleared long ago */
BUG_ON(eflags & X86_EFLAGS_AC);
- if (cpu_has(c, X86_FEATURE_SMAP)) {
-#ifdef CONFIG_X86_SMAP
+ if (cpu_has(c, X86_FEATURE_SMAP))
cr4_set_bits(X86_CR4_SMAP);
-#else
- cr4_clear_bits(X86_CR4_SMAP);
-#endif
- }
}
static __always_inline void setup_umip(struct cpuinfo_x86 *c)
@@ -351,7 +413,8 @@ out:
/* These bits should not change their value after CPU init is finished. */
static const unsigned long cr4_pinned_mask =
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE;
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -374,7 +437,7 @@ set_register:
}
EXPORT_SYMBOL(native_write_cr0);
-void native_write_cr4(unsigned long val)
+void __no_profile native_write_cr4(unsigned long val)
{
unsigned long bits_changed = 0;
@@ -466,27 +529,22 @@ static bool pku_disabled;
static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
- struct pkru_state *pk;
+ if (c == &boot_cpu_data) {
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /*
+ * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+ * bit to be set. Enforce it.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSPKE);
- /* check the boot processor, plus compile options for PKU: */
- if (!cpu_feature_enabled(X86_FEATURE_PKU))
- return;
- /* checks the actual processor's cpuid bits: */
- if (!cpu_has(c, X86_FEATURE_PKU))
- return;
- if (pku_disabled)
+ } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
return;
+ }
cr4_set_bits(X86_CR4_PKE);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (pk)
- pk->pkru = init_pkru_value;
- /*
- * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
- * cpuid bit to be set. We need to ensure that we
- * update that bit in this CPU's "cpu_info".
- */
- set_cpu_cap(c, X86_FEATURE_OSPKE);
+ /* Load the default PKRU value */
+ pkru_write_default();
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -510,6 +568,58 @@ static __init int setup_disable_pku(char *arg)
__setup("nopku", setup_disable_pku);
#endif /* CONFIG_X86_64 */
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr u64 ibt_save(void)
+{
+ u64 msr = 0;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrl(MSR_IA32_S_CET, msr);
+ wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ }
+
+ return msr;
+}
+
+__noendbr void ibt_restore(u64 save)
+{
+ u64 msr;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrl(MSR_IA32_S_CET, msr);
+ msr &= ~CET_ENDBR_EN;
+ msr |= (save & CET_ENDBR_EN);
+ wrmsrl(MSR_IA32_S_CET, msr);
+ }
+}
+
+#endif
+
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ u64 msr = CET_ENDBR_EN;
+
+ if (!HAS_KERNEL_IBT ||
+ !cpu_feature_enabled(X86_FEATURE_IBT))
+ return;
+
+ wrmsrl(MSR_IA32_S_CET, msr);
+ cr4_set_bits(X86_CR4_CET);
+
+ if (!ibt_selftest()) {
+ pr_err("IBT selftest: Failed!\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ return;
+ }
+}
+
+__noendbr void cet_disable(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_IBT))
+ wrmsrl(MSR_IA32_S_CET, 0);
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -599,7 +709,6 @@ void load_percpu_segment(int cpu)
__loadsegment_simple(gs, 0);
wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
#endif
- load_stack_canary_segment();
}
#ifdef CONFIG_X86_32
@@ -1044,6 +1153,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
@@ -1100,18 +1211,42 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
X86_FEATURE_ANY, issues)
#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
+ BIT(7) | BIT(0xB), MMIO),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
+ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
{}
};
@@ -1132,6 +1267,13 @@ u64 x86_read_arch_cap_msr(void)
return ia32_cap;
}
+static bool arch_cap_mmio_immune(u64 ia32_cap)
+{
+ return (ia32_cap & ARCH_CAP_FBSDP_NO &&
+ ia32_cap & ARCH_CAP_PSDP_NO &&
+ ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -1185,12 +1327,27 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
/*
* SRBDS affects CPUs which support RDRAND or RDSEED and are listed
* in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
*/
if ((cpu_has(c, X86_FEATURE_RDRAND) ||
cpu_has(c, X86_FEATURE_RDSEED)) &&
- cpu_matches(cpu_vuln_blacklist, SRBDS))
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
setup_force_cpu_bug(X86_BUG_SRBDS);
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * Affected CPU list is generally enough to enumerate the vulnerability,
+ * but for virtualization case check for ARCH_CAP MSR bits also, VMM may
+ * not want the guest to enumerate the bug.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, MMIO) &&
+ !arch_cap_mmio_immune(ia32_cap))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1231,8 +1388,8 @@ static void detect_nopl(void)
static void __init cpu_parse_early_param(void)
{
char arg[128];
- char *argptr = arg;
- int arglen, res, bit;
+ char *argptr = arg, *opt;
+ int arglen, taint = 0;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -1260,21 +1417,61 @@ static void __init cpu_parse_early_param(void)
return;
pr_info("Clearing CPUID bits:");
- do {
- res = get_option(&argptr, &bit);
- if (res == 0 || res == 3)
- break;
- /* If the argument was too long, the last bit may be cut off */
- if (res == 1 && arglen >= sizeof(arg))
- break;
+ while (argptr) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&argptr, ",");
- if (bit >= 0 && bit < NCAPINTS * 32) {
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
+ else
+#endif
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+
+ setup_clear_cpu_cap(bit);
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ for (bit = 0; bit < 32 * NCAPINTS; bit++) {
+ if (!x86_cap_flag(bit))
+ continue;
+
+ if (strcmp(x86_cap_flag(bit), opt))
+ continue;
+
+ pr_cont(" %s", opt);
setup_clear_cpu_cap(bit);
+ taint++;
+ found = true;
+ break;
}
- } while (res == 2);
+
+ if (!found)
+ pr_cont(" (unknown: %s)", opt);
+#endif
+ }
pr_cont("\n");
+
+ if (taint)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
}
/*
@@ -1330,10 +1527,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_set_bug_bits(c);
- cpu_set_core_cap_bits(c);
+ sld_setup(c);
fpu__init_system(c);
+ init_sigframe_size();
+
#ifdef CONFIG_X86_32
/*
* Regardless of whether PCID is enumerated, the SDM says
@@ -1393,9 +1592,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1404,7 +1602,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
* where GS is unused by the prev and next threads.
*
* Since neither vendor documents this anywhere that I can see,
- * detect it directly instead of hardcoding the choice by
+ * detect it directly instead of hard-coding the choice by
* vendor.
*
* I've designated AMD's behavior as the "bug" because it's
@@ -1416,10 +1614,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1455,8 +1686,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
@@ -1594,6 +1823,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
x86_init_rdrand(c);
setup_pku(c);
+ setup_cet(c);
/*
* Clear/Set all flags overridden by options, need do it
@@ -1617,6 +1847,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
}
+ ppin_init(c);
+
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
@@ -1660,6 +1892,8 @@ void enable_sep_cpu(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -1681,15 +1915,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
-}
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
- return 1;
+ tsx_ap_init();
}
-__setup("noclflush", setup_noclflush);
void print_cpu_info(struct cpuinfo_x86 *c)
{
@@ -1719,9 +1947,8 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in fpu__init_parse_early_param.
- * But we need to keep a dummy __setup around otherwise it would
- * show up as an environment variable for init.
+ * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
+ * function prevents it from becoming an environment variable for init.
*/
static __init int setup_clearcpuid(char *arg)
{
@@ -1748,6 +1975,19 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
+static void wrmsrl_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrl(MSR_CSTAR, val);
+}
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -1755,7 +1995,7 @@ void syscall_init(void)
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1767,16 +2007,22 @@ void syscall_init(void)
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
- wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+ wrmsrl_cstar((unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
- /* Flags to clear on syscall */
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
}
#else /* CONFIG_X86_64 */
@@ -1796,7 +2042,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
#endif /* CONFIG_X86_64 */
@@ -1850,8 +2097,8 @@ static inline void setup_getcpu(int cpu)
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
- if (boot_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux(cpudata);
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ wrmsr(MSR_TSC_AUX, cpudata, 0);
/* Store CPU and node number in limit. */
d.limit0 = cpudata;
@@ -1931,19 +2178,21 @@ void cpu_init_exception_handling(void)
load_TR_desc();
+ /* GHCB needs to be setup to handle #VC. */
+ setup_ghcb();
+
/* Finally load the IDT */
load_current_idt();
}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless, this function acts as a 'CPU state barrier',
+ * nothing should get across.
*/
void cpu_init(void)
{
- struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
@@ -1956,8 +2205,6 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
- setup_getcpu(cpu);
-
pr_debug("Initializing CPU#%d\n", cpu);
if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
@@ -1969,7 +2216,6 @@ void cpu_init(void)
* and set up the GDT descriptor:
*/
switch_to_new_gdt(cpu);
- load_current_idt();
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
@@ -1989,12 +2235,6 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, cur);
- /* Initialize the TSS. */
- tss_setup_ist(tss);
- tss_setup_io_bitmap(tss);
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-
- load_TR_desc();
/*
* sp0 points to the entry trampoline stack regardless of what task
* is running.
@@ -2016,6 +2256,19 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
+#ifdef CONFIG_SMP
+void cpu_init_secondary(void)
+{
+ /*
+ * Relies on the BP having set-up the IDT tables, which are loaded
+ * on this CPU in cpu_init_exception_handling().
+ */
+ cpu_init_exception_handling();
+ cpu_init();
+}
+#endif
+
+#ifdef CONFIG_MICROCODE_LATE_LOADING
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
@@ -2045,6 +2298,7 @@ void microcode_check(void)
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+#endif
/*
* Invoked from core CPU hotplug code after hotplug operations
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 67944128876d..2a8e584fc991 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -48,16 +48,17 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
enum tsx_ctrl_states {
TSX_CTRL_ENABLE,
TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
TSX_CTRL_NOT_SUPPORTED,
};
extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
-extern void tsx_enable(void);
-extern void tsx_disable(void);
+void tsx_ap_init(void);
#else
static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void get_cpu_cap(struct cpuinfo_x86 *c);
@@ -73,6 +74,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..c881bcafba7d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,12 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{}
};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 1d9b8aaea06c..7227c15299d0 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
@@ -173,10 +177,29 @@ update_caps:
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index ae59115d18f9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -215,12 +215,12 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
u32 ecx;
ecx = cpuid_ecx(0x8000001e);
- nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes_per_socket = ((value >> 3) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
}
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
@@ -260,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#endif
@@ -331,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0e422a544835..fd5dead8371c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,9 +7,13 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -41,12 +45,13 @@ enum split_lock_detect_state {
sld_off = 0,
sld_warn,
sld_fatal,
+ sld_ratelimit,
};
/*
- * Default to sld_off because most systems do not support split lock detection
- * split_lock_setup() will switch this to sld_warn on systems that support
- * split lock detect, unless there is a command line override.
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
*/
static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
static u64 msr_test_ctrl_cache __ro_after_init;
@@ -89,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
{
ring3mwait_disabled = true;
- return 0;
+ return 1;
}
__setup("ring3mwait=disable", ring3mwait_disable);
@@ -179,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
return false;
}
+int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if (model >= 5 || family > 6) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -301,7 +338,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* The operating system must reload CR3 to cause the TLB to be flushed"
*
* As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
- * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
if (c->x86 == 5 && c->x86_model == 9) {
@@ -603,6 +640,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
}
static void split_lock_init(void);
+static void bus_lock_init(void);
static void init_intel(struct cpuinfo_x86 *c)
{
@@ -714,12 +752,8 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_misc_features(c);
- if (tsx_ctrl_state == TSX_CTRL_ENABLE)
- tsx_enable();
- if (tsx_ctrl_state == TSX_CTRL_DISABLE)
- tsx_disable();
-
split_lock_init();
+ bus_lock_init();
intel_init_thermal(c);
}
@@ -995,13 +1029,32 @@ static const struct {
{ "off", sld_off },
{ "warn", sld_warn },
{ "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
};
+static struct ratelimit_state bld_ratelimit;
+
+static DEFINE_SEMAPHORE(buslock_sem);
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
- int len = strlen(opt);
+ int len = strlen(opt), ratelimit;
- return len == arglen && !strncmp(arg, opt, len);
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
}
static bool split_lock_verify_msr(bool on)
@@ -1020,16 +1073,15 @@ static bool split_lock_verify_msr(bool on)
return ctrl == tmp;
}
-static void __init split_lock_setup(void)
+static void __init sld_state_setup(void)
{
enum split_lock_detect_state state = sld_warn;
char arg[20];
int i, ret;
- if (!split_lock_verify_msr(false)) {
- pr_info("MSR access failed: Disabled\n");
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
return;
- }
ret = cmdline_find_option(boot_command_line, "split_lock_detect",
arg, sizeof(arg));
@@ -1041,17 +1093,14 @@ static void __init split_lock_setup(void)
}
}
}
+ sld_state = state;
+}
- switch (state) {
- case sld_off:
- pr_info("disabled\n");
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
return;
- case sld_warn:
- pr_info("warning about user-space split_locks\n");
- break;
- case sld_fatal:
- pr_info("sending SIGBUS on user-space split_locks\n");
- break;
}
rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
@@ -1061,7 +1110,9 @@ static void __init split_lock_setup(void)
return;
}
- sld_state = state;
+ /* Restore the MSR to its cached value. */
+ wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
}
@@ -1082,22 +1133,65 @@ static void sld_update_msr(bool on)
static void split_lock_init(void)
{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
if (cpu_model_supports_sld)
split_lock_verify_msr(sld_state != sld_off);
}
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
static void split_lock_warn(unsigned long ip)
{
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
+ int cpu;
- /*
- * Disable the split lock detection for this task so it can make
- * progress and set TIF_SLD so the detection is re-enabled via
- * switch_to_sld() when the task is scheduled out.
- */
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ /* misery factor #1, sleep 10ms before trying to execute split lock */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /* Misery factor #2, only allow one buslocked disabled core at a time */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
- set_tsk_thread_flag(current, TIF_SLD);
+ put_cpu();
}
bool handle_guest_split_lock(unsigned long ip)
@@ -1118,6 +1212,29 @@ bool handle_guest_split_lock(unsigned long ip)
}
EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+static void bus_lock_init(void)
+{
+ u64 val;
+
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) ||
+ (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off)
+ return;
+
+ /*
+ * Enable #DB for bus lock. All bus locks are handled in #DB except
+ * split locks are handled in #AC in the fatal case.
+ */
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
@@ -1126,16 +1243,25 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
return true;
}
-/*
- * This function is called only when switching between tasks with
- * different split-lock detection modes. It sets the MSR for the
- * mode of the new task. This is right most of the time, but since
- * the MSR is shared by hyperthreads on a physical core there can
- * be glitches when the two threads need different modes.
- */
-void switch_to_sld(unsigned long tifn)
+void handle_bus_lock(struct pt_regs *regs)
{
- sld_update_msr(!(tifn & _TIF_SLD));
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
}
/*
@@ -1163,10 +1289,11 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
{}
};
-void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
{
const struct x86_cpu_id *m;
u64 ia32_core_caps;
@@ -1193,5 +1320,64 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
}
cpu_model_supports_sld = true;
- split_lock_setup();
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
+
+#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
+
+/**
+ * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
+ *
+ * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
+ * a hybrid processor. If the processor is not hybrid, returns 0.
+ */
+u8 get_this_hybrid_cpu_type(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return 0;
+
+ return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f4dd73396f28..fbaf12e43f41 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/pm.h>
+#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb);
#define EPB_SAVED 0x10ULL
#define MAX_EPB EPB_MASK
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
static int intel_epb_save(void)
{
u64 epb;
@@ -90,7 +107,7 @@ static void intel_epb_restore(void)
*/
val = epb & EPB_MASK;
if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
- val = ENERGY_PERF_BIAS_NORMAL;
+ val = energ_perf_values[EPB_INDEX_NORMAL];
pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
}
}
@@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = {
};
static const char * const energy_perf_strings[] = {
- "performance",
- "balance-performance",
- "normal",
- "balance-power",
- "power"
-};
-static const u8 energ_perf_values[] = {
- ENERGY_PERF_BIAS_PERFORMANCE,
- ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
- ENERGY_PERF_BIAS_NORMAL,
- ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
- ENERGY_PERF_BIAS_POWERSAVE
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
};
static ssize_t energy_perf_bias_show(struct device *dev,
@@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu)
return 0;
}
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ {}
+};
+
static __init int intel_epb_init(void)
{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
int ret;
if (!boot_cpu_has(X86_FEATURE_EPB))
return -ENODEV;
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
"x86/intel/epb:online", intel_epb_online,
intel_epb_offline);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..1c87501e0fa3 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -71,33 +71,58 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "load_store", "Load Store Unit" },
- [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
- [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP] = { "psp", "Platform Security Processor" },
- [SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU] = { "smu", "System Management Unit" },
- [SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE] = { "pcie", "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_RESERVED] = { "reserved", "Reserved" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
+ [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
+ [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "nbif", "NBIF Unit" },
+ [SMCA_SHUB] = { "shub", "System Hub Unit" },
+ [SMCA_SATA] = { "sata", "SATA Unit" },
+ [SMCA_USB] = { "usb", "USB Unit" },
+ [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
+ [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -117,21 +142,22 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
-static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;
if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;
- b = &smca_banks[bank];
+ b = &per_cpu(smca_banks, cpu)[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;
return b->hwid->bank_type;
}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
-static struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */
/* Reserved type */
@@ -155,6 +181,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
@@ -170,16 +197,27 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -235,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
- struct smca_hwid *s_hwid;
u32 high, low;
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
@@ -272,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
smca_set_misc_banks_map(bank, cpu);
- /* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
- return;
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -286,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
+
if (hwid_mcatype == s_hwid->hwid_mcatype) {
- smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = low;
- smca_banks[bank].sysfs_id = s_hwid->count++;
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
break;
}
}
@@ -387,7 +423,7 @@ static void threshold_restart_bank(void *_tr)
u32 hi, lo;
/* sysfs write might race against an offline operation */
- if (this_cpu_read(threshold_banks))
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
return;
rdmsr(tr->b->address, lo, hi);
@@ -513,7 +549,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
/* Fall back to method we used for older processors: */
switch (block) {
case 0:
- addr = msr_ops.misc(bank);
+ addr = mca_msr_reg(bank, MCA_MISC);
break;
case 1:
offset = ((low & MASK_BLKPTR_LO) >> 21);
@@ -575,7 +611,7 @@ out:
bool amd_filter_mce(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
@@ -613,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
- if (smca_get_bank_type(bank) != SMCA_IF)
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
return;
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
@@ -675,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
-{
- u64 dram_base_addr, dram_limit_addr, dram_hole_base;
- /* We start from the normalized address */
- u64 ret_addr = norm_addr;
-
- u32 tmp;
-
- u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
- u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
- u8 intlv_addr_sel, intlv_addr_bit;
- u8 num_intlv_bits, hashed_bit;
- u8 lgcy_mmio_hole_en, base = 0;
- u8 cs_mask, cs_id = 0;
- bool hash_enabled = false;
-
- /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
- if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
- goto out_err;
-
- /* Remove HiAddrOffset from normalized address, if enabled: */
- if (tmp & BIT(0)) {
- u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
-
- if (norm_addr >= hi_addr_offset) {
- ret_addr -= hi_addr_offset;
- base = 1;
- }
- }
-
- /* Read D18F0x110 (DramBaseAddress). */
- if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
- goto out_err;
-
- /* Check if address range is valid. */
- if (!(tmp & BIT(0))) {
- pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
- __func__, tmp);
- goto out_err;
- }
-
- lgcy_mmio_hole_en = tmp & BIT(1);
- intlv_num_chan = (tmp >> 4) & 0xF;
- intlv_addr_sel = (tmp >> 8) & 0x7;
- dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
-
- /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
- if (intlv_addr_sel > 3) {
- pr_err("%s: Invalid interleave address select %d.\n",
- __func__, intlv_addr_sel);
- goto out_err;
- }
-
- /* Read D18F0x114 (DramLimitAddress). */
- if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
- goto out_err;
-
- intlv_num_sockets = (tmp >> 8) & 0x1;
- intlv_num_dies = (tmp >> 10) & 0x3;
- dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
-
- intlv_addr_bit = intlv_addr_sel + 8;
-
- /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
- switch (intlv_num_chan) {
- case 0: intlv_num_chan = 0; break;
- case 1: intlv_num_chan = 1; break;
- case 3: intlv_num_chan = 2; break;
- case 5: intlv_num_chan = 3; break;
- case 7: intlv_num_chan = 4; break;
-
- case 8: intlv_num_chan = 1;
- hash_enabled = true;
- break;
- default:
- pr_err("%s: Invalid number of interleaved channels %d.\n",
- __func__, intlv_num_chan);
- goto out_err;
- }
-
- num_intlv_bits = intlv_num_chan;
-
- if (intlv_num_dies > 2) {
- pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
- __func__, intlv_num_dies);
- goto out_err;
- }
-
- num_intlv_bits += intlv_num_dies;
-
- /* Add a bit if sockets are interleaved. */
- num_intlv_bits += intlv_num_sockets;
-
- /* Assert num_intlv_bits <= 4 */
- if (num_intlv_bits > 4) {
- pr_err("%s: Invalid interleave bits %d.\n",
- __func__, num_intlv_bits);
- goto out_err;
- }
-
- if (num_intlv_bits > 0) {
- u64 temp_addr_x, temp_addr_i, temp_addr_y;
- u8 die_id_bit, sock_id_bit, cs_fabric_id;
-
- /*
- * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
- * This is the fabric id for this coherent slave. Use
- * umc/channel# as instance id of the coherent slave
- * for FICAA.
- */
- if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
- goto out_err;
-
- cs_fabric_id = (tmp >> 8) & 0xFF;
- die_id_bit = 0;
-
- /* If interleaved over more than 1 channel: */
- if (intlv_num_chan) {
- die_id_bit = intlv_num_chan;
- cs_mask = (1 << die_id_bit) - 1;
- cs_id = cs_fabric_id & cs_mask;
- }
-
- sock_id_bit = die_id_bit;
-
- /* Read D18F1x208 (SystemFabricIdMask). */
- if (intlv_num_dies || intlv_num_sockets)
- if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
- goto out_err;
-
- /* If interleaved over more than 1 die. */
- if (intlv_num_dies) {
- sock_id_bit = die_id_bit + intlv_num_dies;
- die_id_shift = (tmp >> 24) & 0xF;
- die_id_mask = (tmp >> 8) & 0xFF;
-
- cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
- }
-
- /* If interleaved over more than 1 socket. */
- if (intlv_num_sockets) {
- socket_id_shift = (tmp >> 28) & 0xF;
- socket_id_mask = (tmp >> 16) & 0xFF;
-
- cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
- }
-
- /*
- * The pre-interleaved address consists of XXXXXXIIIYYYYY
- * where III is the ID for this CS, and XXXXXXYYYYY are the
- * address bits from the post-interleaved address.
- * "num_intlv_bits" has been calculated to tell us how many "I"
- * bits there are. "intlv_addr_bit" tells us how many "Y" bits
- * there are (where "I" starts).
- */
- temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
- temp_addr_i = (cs_id << intlv_addr_bit);
- temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
- ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
- }
-
- /* Add dram base address */
- ret_addr += dram_base_addr;
-
- /* If legacy MMIO hole enabled */
- if (lgcy_mmio_hole_en) {
- if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
- goto out_err;
-
- dram_hole_base = tmp & GENMASK(31, 24);
- if (ret_addr >= dram_hole_base)
- ret_addr += (BIT_ULL(32) - dram_hole_base);
- }
-
- if (hash_enabled) {
- /* Save some parentheses and grab ls-bit at the end. */
- hashed_bit = (ret_addr >> 12) ^
- (ret_addr >> 18) ^
- (ret_addr >> 21) ^
- (ret_addr >> 30) ^
- cs_id;
-
- hashed_bit &= BIT(0);
-
- if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
- ret_addr ^= BIT(intlv_addr_bit);
- }
-
- /* Is calculated system address is above DRAM limit address? */
- if (ret_addr > dram_limit_addr)
- goto out_err;
-
- *sys_addr = ret_addr;
- return 0;
-
-out_err:
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-
bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
if (mce_flags.smca)
- return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
@@ -965,8 +801,8 @@ static void log_error_deferred(unsigned int bank)
{
bool defrd;
- defrd = _log_error_bank(bank, msr_ops.status(bank),
- msr_ops.addr(bank), 0);
+ defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), 0);
if (!mce_flags.smca)
return;
@@ -996,7 +832,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+ _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
}
static void log_and_reset_block(struct threshold_block *block)
@@ -1157,6 +993,7 @@ static struct attribute *default_attrs[] = {
NULL, /* possibly interrupt_enable if supported, see below */
NULL,
};
+ATTRIBUTE_GROUPS(default);
#define to_block(k) container_of(k, struct threshold_block, kobj)
#define to_attr(a) container_of(a, struct threshold_attr, attr)
@@ -1193,11 +1030,11 @@ static void threshold_block_release(struct kobject *kobj);
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
+ .default_groups = default_groups,
.release = threshold_block_release,
};
-static const char *get_name(unsigned int bank, struct threshold_block *b)
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
{
enum smca_bank_types bank_type;
@@ -1208,7 +1045,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- bank_type = smca_get_bank_type(bank);
+ bank_type = smca_get_bank_type(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
@@ -1218,12 +1055,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}
- if (smca_banks[bank].hwid->count == 1)
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
return smca_get_name(bank_type);
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_get_name(bank_type),
- smca_banks[bank].sysfs_id);
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
return buf_mcatype;
}
@@ -1265,10 +1102,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
b->threshold_limit = THRESHOLD_MAX;
if (b->interrupt_capable) {
- threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ default_attrs[2] = &interrupt_enable.attr;
b->interrupt_enable = 1;
} else {
- threshold_ktype.default_attrs[2] = NULL;
+ default_attrs[2] = NULL;
}
INIT_LIST_HEAD(&b->miscj);
@@ -1279,7 +1116,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
else
tb->blocks = b;
- err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
recurse:
@@ -1334,7 +1171,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = get_name(bank, NULL);
+ const char *name = get_name(cpu, bank, NULL);
int err = 0;
if (!dev)
@@ -1384,7 +1221,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
}
}
- err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1457,10 +1294,23 @@ out_free:
kfree(bank);
}
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
int mce_threshold_remove_device(unsigned int cpu)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks);
- unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
if (!bp)
return 0;
@@ -1471,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu)
*/
this_cpu_write(threshold_banks, NULL);
- for (bank = 0; bank < numbanks; bank++) {
- if (bp[bank]) {
- threshold_remove_bank(bp[bank]);
- bp[bank] = NULL;
- }
- }
- kfree(bp);
+ __threshold_remove_device(bp);
return 0;
}
@@ -1514,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
- if (err)
- goto out_err;
+ if (err) {
+ __threshold_remove_device(bp);
+ return err;
+ }
}
this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
return 0;
-out_err:
- mce_threshold_remove_device(cpu);
- return err;
}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index b58b85380ddb..717192915f28 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -36,7 +36,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
mce_setup(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
@@ -176,16 +177,14 @@ retry:
/* no more record */
if (*record_id == APEI_ERST_INVALID_RECORD_ID)
goto out;
- rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+ rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
+ &CPER_CREATOR_MCE);
/* someone else has cleared the record, try next one */
if (rc == -ENOENT)
goto retry;
else if (rc < 0)
goto out;
- /* try to skip other type records in storage */
- else if (rc != sizeof(rcd) ||
- !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE))
- goto retry;
+
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
out:
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7962355436da..2c8ec5c71712 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_bank {
u64 ctl; /* subevents to enable */
- bool init; /* initialise bank? */
+
+ __u64 init : 1, /* initialise bank? */
+ __reserved_1 : 63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
@@ -86,20 +88,11 @@ struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
- /*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
- .tolerant = 1,
.monarch_timeout = -1
};
static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
-static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -121,8 +114,6 @@ mce_banks_t mce_banks_ce_disabled;
static struct work_struct mce_work;
static struct irq_work mce_irq_work;
-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -130,7 +121,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-noinstr void mce_setup(struct mce *m)
+void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -141,12 +132,7 @@ noinstr void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
-
- if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
- m->ppin = __rdmsr(MSR_PPIN);
- else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
- m->ppin = __rdmsr(MSR_AMD_PPIN);
-
+ m->ppin = cpu_data(m->extcpu).ppin;
m->microcode = boot_cpu_data.microcode;
}
@@ -176,53 +162,6 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static inline u32 ctl_reg(int bank)
-{
- return MSR_IA32_MCx_CTL(bank);
-}
-
-static inline u32 status_reg(int bank)
-{
- return MSR_IA32_MCx_STATUS(bank);
-}
-
-static inline u32 addr_reg(int bank)
-{
- return MSR_IA32_MCx_ADDR(bank);
-}
-
-static inline u32 misc_reg(int bank)
-{
- return MSR_IA32_MCx_MISC(bank);
-}
-
-static inline u32 smca_ctl_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_CTL(bank);
-}
-
-static inline u32 smca_status_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_STATUS(bank);
-}
-
-static inline u32 smca_addr_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_ADDR(bank);
-}
-
-static inline u32 smca_misc_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_MISC(bank);
-}
-
-struct mca_msr_regs msr_ops = {
- .ctl = ctl_reg,
- .status = status_reg,
- .addr = addr_reg,
- .misc = misc_reg
-};
-
static void __print_mce(struct mce *m)
{
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
@@ -295,11 +234,17 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -314,7 +259,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -342,8 +287,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
if (!apei_err)
apei_err = apei_write_mce(final);
}
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
@@ -352,6 +295,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -362,24 +308,27 @@ static int msr_to_offset(u32 msr)
if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
- if (msr == msr_ops.status(bank))
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
return offsetof(struct mce, status);
- if (msr == msr_ops.addr(bank))
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
return offsetof(struct mce, addr);
- if (msr == msr_ops.misc(bank))
+ if (msr == mca_msr_reg(bank, MCA_MISC))
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
return -1;
}
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
- pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
show_stack_regs(regs);
@@ -387,12 +336,10 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
while (true)
cpu_relax();
-
- return true;
}
/* MSR access wrappers used for error injection */
-static noinstr u64 mce_rdmsrl(u32 msr)
+noinstr u64 mce_rdmsrl(u32 msr)
{
DECLARE_ARGS(val, low, high);
@@ -420,32 +367,13 @@ static noinstr u64 mce_rdmsrl(u32 msr)
*/
asm volatile("1: rdmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
- regs->ip, (void *)regs->ip);
-
- show_stack_regs(regs);
-
- panic("MCA architectural violation!\n");
-
- while (true)
- cpu_relax();
-
- return true;
-}
-
static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
u32 low, high;
@@ -470,7 +398,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
/* See comment in mce_rdmsrl() */
asm volatile("1: wrmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
@@ -479,9 +407,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
+ /*
+ * Enable instrumentation around mce_setup() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
mce_setup(m);
+ instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
@@ -529,7 +463,7 @@ static void mce_irq_work_cb(struct irq_work *entry)
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
*/
int mce_usable_address(struct mce *m)
{
@@ -647,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
- set_mce_nospec(pfn, whole_page(mce));
+ set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
}
@@ -682,13 +616,13 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(msr_ops.misc(i));
+ m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(msr_ops.addr(i));
+ m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -758,7 +692,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.bank = i;
barrier();
- m.status = mce_rdmsrl(msr_ops.status(i));
+ m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
@@ -808,7 +742,7 @@ log_it:
goto clear_it;
mce_read_aux(&m, i);
- m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false);
+ m.severity = mce_severity(&m, NULL, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
@@ -817,13 +751,16 @@ log_it:
if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
goto clear_it;
- mce_log(&m);
+ if (flags & MCP_QUEUE_LOG)
+ mce_gen_pool_add(&m);
+ else
+ mce_log(&m);
clear_it:
/*
* Clear state for this bank.
*/
- mce_wrmsrl(msr_ops.status(i), 0);
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
@@ -838,26 +775,108 @@ clear_it:
EXPORT_SYMBOL_GPL(machine_check_poll);
/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static __always_inline void
+quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+
+ instrumentation_begin();
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ instrumentation_end();
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
- struct pt_regs *regs)
+static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
{
char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
- __set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
+ arch___set_bit(i, validp);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
m->bank = i;
- if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -886,8 +905,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t, const char *msg)
+static noinstr int mce_timed_out(u64 *t, const char *msg)
{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
/*
* The others already did panic for some reason.
* Bail out like in a timeout.
@@ -900,19 +924,22 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1) {
- if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
- pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
- cpumask_pr_args(&mce_missing_cpus));
- mce_panic(msg, NULL, NULL);
- }
- cpu_missing = 1;
- return 1;
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
+ mce_panic(msg, NULL, NULL);
+
+ ret = 1;
+ goto out;
}
*t -= SPINUNIT;
+
out:
touch_nmi_watchdog();
- return 0;
+
+ instrumentation_end();
+
+ return ret;
}
/*
@@ -965,9 +992,9 @@ static void mce_reign(void)
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
- if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ if (m && global_worst >= MCE_PANIC_SEVERITY) {
/* call mce_severity() to get "msg" for panic */
- mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
+ mce_severity(m, NULL, &msg, true);
mce_panic("Fatal machine check", m, msg);
}
@@ -981,7 +1008,7 @@ static void mce_reign(void)
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
- if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+ if (global_worst <= MCE_KEEP_SEVERITY)
mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
@@ -1001,31 +1028,33 @@ static atomic_t global_nwo;
* in the entry order.
* TBD double check parallel CPU hotunplug
*/
-static int mce_start(int *no_way_out)
+static noinstr int mce_start(int *no_way_out)
{
- int order;
- int cpus = num_online_cpus();
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
if (!timeout)
- return -1;
+ return ret;
- atomic_add(*no_way_out, &global_nwo);
+ arch_atomic_add(*no_way_out, &global_nwo);
/*
* Rely on the implied barrier below, such that global_nwo
* is updated before mce_callin.
*/
- order = atomic_inc_return(&mce_callin);
- cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+ order = arch_atomic_inc_return(&mce_callin);
+ arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
/*
* Wait for everyone.
*/
- while (atomic_read(&mce_callin) != cpus) {
+ while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Not all CPUs entered broadcast exception handler")) {
- atomic_set(&global_nwo, 0);
- return -1;
+ arch_atomic_set(&global_nwo, 0);
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1039,7 +1068,7 @@ static int mce_start(int *no_way_out)
/*
* Monarch: Starts executing now, the others wait.
*/
- atomic_set(&mce_executing, 1);
+ arch_atomic_set(&mce_executing, 1);
} else {
/*
* Subject: Now start the scanning loop one by one in
@@ -1047,11 +1076,11 @@ static int mce_start(int *no_way_out)
* This way when there are any shared banks it will be
* only seen by one CPU before cleared, avoiding duplicates.
*/
- while (atomic_read(&mce_executing) < order) {
+ while (arch_atomic_read(&mce_executing) < order) {
if (mce_timed_out(&timeout,
"Timeout: Subject CPUs unable to finish machine check processing")) {
- atomic_set(&global_nwo, 0);
- return -1;
+ arch_atomic_set(&global_nwo, 0);
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1060,19 +1089,27 @@ static int mce_start(int *no_way_out)
/*
* Cache the global no_way_out state.
*/
- *no_way_out = atomic_read(&global_nwo);
+ *no_way_out = arch_atomic_read(&global_nwo);
+
+ ret = order;
+
+out:
+ instrumentation_end();
- return order;
+ return ret;
}
/*
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1085,14 +1122,11 @@ static int mce_end(int order)
atomic_inc(&mce_executing);
if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
/*
* Monarch: Wait for everyone to go through their scanning
* loops.
*/
- while (atomic_read(&mce_executing) <= cpus) {
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Monarch CPU unable to finish machine check processing"))
goto reset;
@@ -1116,7 +1150,8 @@ static int mce_end(int order)
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1132,16 +1167,20 @@ reset:
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
-static void mce_clear_state(unsigned long *toclear)
+static __always_inline void mce_clear_state(unsigned long *toclear)
{
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(msr_ops.status(i), 0);
+ if (arch_test_bit(i, toclear))
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1180,17 +1219,18 @@ static noinstr bool mce_check_crashing_cpu(void)
return false;
}
-static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks,
- int no_way_out, int *worst)
+static __always_inline int
+__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
+ int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
- int severity, i;
+ int severity, i, taint = 0;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- __clear_bit(i, toclear);
- if (!test_bit(i, valid_banks))
+ arch___clear_bit(i, toclear);
+ if (!arch_test_bit(i, valid_banks))
continue;
if (!mce_banks[i].ctl)
@@ -1200,7 +1240,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
m->addr = 0;
m->bank = i;
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1213,9 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
continue;
/* Set taint even when machine check was not enabled. */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ taint++;
- severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
+ severity = mce_severity(m, regs, NULL, true);
/*
* When machine check was for corrected/deferred handler don't
@@ -1225,7 +1265,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
severity == MCE_UCNA_SEVERITY) && !no_way_out)
continue;
- __set_bit(i, toclear);
+ arch___set_bit(i, toclear);
/* Machine check event was not enabled. Clear, but ignore. */
if (severity == MCE_NO_SEVERITY)
@@ -1236,7 +1276,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* assuming valid severity level != 0 */
m->severity = severity;
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
mce_log(m);
+ instrumentation_end();
if (severity > *worst) {
*final = *m;
@@ -1246,10 +1292,15 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* mce_clear_state will clear *final, save locally for use later */
*m = *final;
+
+ return taint;
}
static void kill_me_now(struct callback_head *ch)
{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
force_sig(SIGBUS);
}
@@ -1257,48 +1308,88 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ int ret;
+ p->mce_count = 0;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
- !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+ ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ if (!ret) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
sync_core();
return;
}
- if (p->mce_vaddr != (void __user *)-1l) {
- force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
- } else {
- pr_err("Memory error not recovered");
- kill_me_now(cb);
- }
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info,
+ * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
+ *
+ * In both cases, no further processing is required.
+ */
+ if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
+ return;
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
}
-static void queue_task_work(struct mce *m, int kill_current_task)
+static void kill_me_never(struct callback_head *cb)
{
- current->mce_addr = m->addr;
- current->mce_kflags = m->kflags;
- current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
- current->mce_whole_page = whole_page(m);
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+}
+
+static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+{
+ int count = ++current->mce_count;
+
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+ current->mce_kill_me.func = func;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", m, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
*
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
* think about putting a printk in there!
*
* On Intel systems this is entered on all CPUs in parallel through
@@ -1310,39 +1401,56 @@ static void queue_task_work(struct mce *m, int kill_current_task)
* issues: if the machine check was due to a failure of the memory
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required. *
*/
noinstr void do_machine_check(struct pt_regs *regs)
{
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- struct mca_config *cfg = &mca_cfg;
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
struct mce m, *final;
char *msg = NULL;
- int worst = 0;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
+
+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ goto clear;
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
- int order = -1;
+ order = -1;
/*
* If no_way_out gets set, there is no safe way to recover from this
- * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
+ * MCE.
*/
- int no_way_out = 0;
+ no_way_out = 0;
/*
* If kill_current_task is not set, there might be a way to recover from this
* error.
*/
- int kill_current_task = 0;
+ kill_current_task = 0;
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
* on Intel.
*/
- int lmce = 1;
+ lmce = 1;
this_cpu_inc(mce_exception_count);
@@ -1352,7 +1460,6 @@ noinstr void do_machine_check(struct pt_regs *regs)
final = this_cpu_ptr(&mces_seen);
*final = m;
- memset(valid_banks, 0, sizeof(valid_banks));
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
@@ -1363,7 +1470,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
+ kill_current_task = 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
@@ -1380,13 +1487,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
* to see it will clear it.
*/
if (lmce) {
- if (no_way_out && cfg->tolerant < 3)
+ if (no_way_out)
mce_panic("Fatal local machine check", &m, msg);
} else {
order = mce_start(&no_way_out);
}
- __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1400,7 +1507,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (!no_way_out)
no_way_out = worst >= MCE_PANIC_SEVERITY;
- if (no_way_out && cfg->tolerant < 3)
+ if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg);
}
} else {
@@ -1412,12 +1519,22 @@ noinstr void do_machine_check(struct pt_regs *regs)
* fatal error. We call "mce_severity()" again to
* make sure we have the right "msg".
*/
- if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
- mce_severity(&m, regs, cfg->tolerant, &msg, true);
+ if (worst >= MCE_PANIC_SEVERITY) {
+ mce_severity(&m, regs, &msg, true);
mce_panic("Local fatal machine check!", &m, msg);
}
}
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
if (worst != MCE_AR_SEVERITY && !kill_current_task)
goto out;
@@ -1426,7 +1543,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, kill_current_task);
+ if (kill_current_task)
+ queue_task_work(&m, msg, kill_me_now);
+ else
+ queue_task_work(&m, msg, kill_me_maybe);
} else {
/*
@@ -1444,9 +1564,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_current_task);
+ queue_task_work(&m, msg, kill_me_never);
}
+
out:
+ instrumentation_end();
+
+clear:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1630,10 +1754,12 @@ static void __mcheck_cpu_init_generic(void)
m_fl = MCP_DONTLOG;
/*
- * Log the machine checks left over from the previous reset.
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC | m_fl, &all_banks);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
cr4_set_bits(X86_CR4_MCE);
@@ -1652,8 +1778,8 @@ static void __mcheck_cpu_init_clear_banks(void)
if (!b->init)
continue;
- wrmsrl(msr_ops.ctl(i), b->ctl);
- wrmsrl(msr_ops.status(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1679,39 +1805,11 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
- rdmsrl(msr_ops.ctl(i), msrval);
+ rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 0)
- return;
- if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
- return;
- if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
- MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
- MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
- MCACOD)) !=
- (MCI_STATUS_UC|MCI_STATUS_EN|
- MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
- MCI_STATUS_AR|MCACOD_INSTR))
- return;
-
- m->mcgstatus |= MCG_STATUS_EIPV;
- m->ip = regs->ip;
- m->cs = regs->cs;
-}
-
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
@@ -1785,7 +1883,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
cfg->bootlog = 0;
if (c->x86 == 6 && c->x86_model == 45)
- quirk_no_way_out = quirk_sandybridge_ifu;
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
@@ -1815,9 +1920,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
return 1;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
return 1;
default:
return 0;
@@ -1836,13 +1943,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
mce_flags.amd_threshold = 1;
-
- if (mce_flags.smca) {
- msr_ops.ctl = smca_ctl_reg;
- msr_ops.status = smca_status_reg;
- msr_ops.addr = smca_addr_reg;
- msr_ops.misc = smca_misc_reg;
- }
}
}
@@ -1972,18 +2072,6 @@ bool filter_mce(struct mce *m)
return false;
}
-/* Handle unconfigured int18 (should never happen) */
-static noinstr void unexpected_machine_check(struct pt_regs *regs)
-{
- instrumentation_begin();
- pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
- smp_processor_id());
- instrumentation_end();
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
irqentry_state_t irq_state;
@@ -1994,31 +2082,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
* Only required when from kernel mode. See
* mce_check_crashing_cpu() for details.
*/
- if (machine_check_vector == do_machine_check &&
- mce_check_crashing_cpu())
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
return;
irq_state = irqentry_nmi_enter(regs);
- /*
- * The call targets are marked noinstr, but objtool can't figure
- * that out because it's an indirect call. Annotate it.
- */
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
irqentry_enter_from_user_mode(regs);
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
@@ -2085,7 +2164,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
- machine_check_vector = do_machine_check;
+ mca_cfg.initialized = 1;
__mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
@@ -2177,10 +2256,9 @@ static int __init mcheck_enable(char *str)
cfg->bios_cmci_threshold = 1;
else if (!strcmp(str, "recovery"))
cfg->recovery = 1;
- else if (isdigit(str[0])) {
- if (get_option(&str, &cfg->tolerant) == 2)
- get_option(&str, &(cfg->monarch_timeout));
- } else {
+ else if (isdigit(str[0]))
+ get_option(&str, &(cfg->monarch_timeout));
+ else {
pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
}
@@ -2193,7 +2271,6 @@ int __init mcheck_init(void)
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
- mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_gen_pool_process);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
@@ -2218,7 +2295,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
}
return;
}
@@ -2431,7 +2508,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret;
}
-static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
@@ -2452,7 +2528,6 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
};
static struct device_attribute *mce_device_attrs[] = {
- &dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
&dev_attr_trigger,
@@ -2570,7 +2645,7 @@ static void mce_reenable_cpu(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
}
}
@@ -2719,7 +2794,6 @@ struct dentry *mce_get_debugfs_dir(void)
static void mce_reset(void)
{
- cpu_missing = 0;
atomic_set(&mce_fake_panicked, 0);
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 7b360731fc2d..5fbd7ffb3233 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -88,12 +88,28 @@ MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
+
static void setup_inj_struct(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
@@ -232,7 +248,7 @@ static void __maybe_unused raise_mce(struct mce *m)
unsigned long start;
int cpu;
- get_online_cpus();
+ cpus_read_lock();
cpumask_copy(mce_inject_cpumask, cpu_online_mask);
cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
for_each_online_cpu(cpu) {
@@ -266,7 +282,7 @@ static void __maybe_unused raise_mce(struct mce *m)
}
raise_local();
put_cpu();
- put_online_cpus();
+ cpus_read_unlock();
} else {
preempt_disable();
raise_local();
@@ -347,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
@@ -487,6 +503,8 @@ static void do_inject(void)
i_mce.tsc = rdtsc_ordered();
+ i_mce.status |= MCI_STATUS_VAL;
+
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
@@ -526,7 +544,7 @@ static void do_inject(void)
cpu = get_nbc_for_node(topology_die_id(cpu));
}
- get_online_cpus();
+ cpus_read_lock();
if (!cpu_online(cpu))
goto err;
@@ -550,7 +568,7 @@ static void do_inject(void)
}
err:
- put_online_cpus();
+ cpus_read_unlock();
}
@@ -574,6 +592,33 @@ static int inj_bank_set(void *data, u64 val)
}
m->bank = val;
+
+ /*
+ * sw-only injection allows to write arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
do_inject();
/* Reset injection struct */
@@ -629,6 +674,8 @@ static const char readme_msg[] =
"\t is present in hardware. \n"
"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
"\n";
static ssize_t
@@ -652,6 +699,7 @@ static struct dfs_node {
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e309476743b7..95275a5e57e0 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -470,45 +470,6 @@ void intel_clear_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
}
-static void intel_ppin_init(struct cpuinfo_x86 *c)
-{
- unsigned long long val;
-
- /*
- * Even if testing the presence of the MSR would be enough, we don't
- * want to risk the situation where other models reuse this MSR for
- * other purposes.
- */
- switch (c->x86_model) {
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_ICELAKE_X:
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
-
- if (rdmsrl_safe(MSR_PPIN_CTL, &val))
- return;
-
- if ((val & 3UL) == 1UL) {
- /* PPIN locked in disabled mode */
- return;
- }
-
- /* If PPIN is disabled, try to enable */
- if (!(val & 2UL)) {
- wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
- rdmsrl_safe(MSR_PPIN_CTL, &val);
- }
-
- /* Is the enable bit set? */
- if (val & 2UL)
- set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
- }
-}
-
/*
* Enable additional error logs from the integrated
* memory controller on processors that support this.
@@ -533,7 +494,6 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_cmci();
intel_init_lmce();
- intel_ppin_init(c);
intel_imc_init(c);
}
@@ -546,12 +506,13 @@ bool intel_filter_mce(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
if ((c->x86 == 6) &&
((c->x86_model == INTEL_FAM6_HASWELL) ||
(c->x86_model == INTEL_FAM6_HASWELL_L) ||
(c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G)) &&
+ (c->x86_model == INTEL_FAM6_HASWELL_G) ||
+ (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..4ae0e603f7fa 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -8,9 +8,6 @@
#include <linux/device.h>
#include <asm/mce.h>
-/* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *);
-
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
-extern int (*mce_severity)(struct mce *a, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp);
+int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
@@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
-static inline bool intel_filter_mce(struct mce *m) { return false; };
+static inline bool intel_filter_mce(struct mce *m) { return false; }
#endif
void mce_timer_kick(unsigned long interval);
@@ -117,23 +113,24 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
struct mca_config {
- bool dont_log_ce;
- bool cmci_disabled;
- bool ignore_ce;
- bool print_all;
-
__u64 lmce_disabled : 1,
disabled : 1,
ser : 1,
recovery : 1,
bios_cmci_threshold : 1,
- __reserved : 59;
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
- s8 bootlog;
- int tolerant;
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
+ s8 bootlog;
};
extern struct mca_config mca_cfg;
@@ -163,37 +160,74 @@ struct mce_vendor_flags {
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
- __reserved_0 : 60;
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};
extern struct mce_vendor_flags mce_flags;
-struct mca_msr_regs {
- u32 (*ctl) (int bank);
- u32 (*status) (int bank);
- u32 (*addr) (int bank);
- u32 (*misc) (int bank);
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
};
-extern struct mca_msr_regs msr_ops;
-
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
#else
-static inline bool amd_filter_mce(struct mce *m) { return false; };
+static inline bool amd_filter_mce(struct mce *m) { return false; }
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_p5_mce(void) {}
+static inline void pentium_machine_check(struct pt_regs *regs) {}
+static inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
+noinstr u64 mce_rdmsrl(u32 msr);
+
+static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
+{
+ if (mce_flags.smca) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
+
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
+ return 0;
+}
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 19e90cae8e97..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -21,7 +21,7 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static noinstr void pentium_machine_check(struct pt_regs *regs)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
@@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 83df991314c5..00483d1c27e4 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -142,7 +142,7 @@ static struct severity {
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
),
MCESEV(
- KEEP, "Non signalled machine check",
+ KEEP, "Non signaled machine check",
SER, BITCLR(MCI_STATUS_S)
),
@@ -218,15 +218,18 @@ static struct severity {
static bool is_copy_from_user(struct pt_regs *regs)
{
u8 insn_buf[MAX_INSN_SIZE];
- struct insn insn;
unsigned long addr;
+ struct insn insn;
+ int ret;
+
+ if (!regs)
+ return false;
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
return false;
- kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
- insn_get_opcode(&insn);
- if (!insn.opcode.got)
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
return false;
switch (insn.opcode.value) {
@@ -234,10 +237,6 @@ static bool is_copy_from_user(struct pt_regs *regs)
case 0x8A: case 0x8B:
/* MOVZ mem,reg */
case 0xB60F: case 0xB70F:
- insn_get_modrm(&insn);
- insn_get_sib(&insn);
- if (!insn.modrm.got || !insn.sib.got)
- return false;
addr = (unsigned long)insn_get_addr_ref(&insn, regs);
break;
/* REP MOVS */
@@ -267,115 +266,103 @@ static bool is_copy_from_user(struct pt_regs *regs)
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
-static int error_context(struct mce *m, struct pt_regs *regs)
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
{
- enum handler_type t;
+ int fixup_type;
+ bool copy_user;
if ((m->cs & 3) == 3)
return IN_USER;
+
if (!mc_recoverable(m->mcgstatus))
return IN_KERNEL;
- t = ex_get_fault_handler_type(m->ip);
- if (t == EX_HANDLER_FAULT) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
- return IN_KERNEL_RECOV;
- }
- if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ switch (fixup_type) {
+ case EX_TYPE_UACCESS:
+ case EX_TYPE_COPY:
+ if (!copy_user)
+ return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
+ fallthrough;
+
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
- }
- return IN_KERNEL;
+ default:
+ return IN_KERNEL;
+ }
}
-static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
- u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
- u32 low, high;
+ char *panic_msg = NULL;
+ int ret;
/*
- * We need to look at the following bits:
- * - "succor" bit (data poisoning support), and
- * - TCC bit (Task Context Corrupt)
- * in MCi_STATUS to determine error severity.
+ * Default return value: Action required, the error must be handled
+ * immediately.
*/
- if (!mce_flags.succor)
- return MCE_PANIC_SEVERITY;
-
- if (rdmsr_safe(addr, &low, &high))
- return MCE_PANIC_SEVERITY;
-
- /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
- if ((low & MCI_CONFIG_MCAX) &&
- (m->status & MCI_STATUS_TCC) &&
- (err_ctx == IN_KERNEL))
- return MCE_PANIC_SEVERITY;
-
- /* ...otherwise invoke hwpoison handler. */
- return MCE_AR_SEVERITY;
-}
-
-/*
- * See AMD Error Scope Hierarchy table in a newer BKDG. For example
- * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
- */
-static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
- char **msg, bool is_excp)
-{
- enum context ctx = error_context(m, regs);
+ ret = MCE_AR_SEVERITY;
/* Processor Context Corrupt, no need to fumble too much, die! */
- if (m->status & MCI_STATUS_PCC)
- return MCE_PANIC_SEVERITY;
-
- if (m->status & MCI_STATUS_UC) {
-
- if (ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
- /*
- * On older systems where overflow_recov flag is not present, we
- * should simply panic if an error overflow occurs. If
- * overflow_recov flag is present and set, then software can try
- * to at least kill process to prolong system operation.
- */
- if (mce_flags.overflow_recov) {
- if (mce_flags.smca)
- return mce_severity_amd_smca(m, ctx);
-
- /* kill current process */
- return MCE_AR_SEVERITY;
- } else {
- /* at least one error was not logged */
- if (m->status & MCI_STATUS_OVER)
- return MCE_PANIC_SEVERITY;
- }
-
- /*
- * For any other case, return MCE_UC_SEVERITY so that we log the
- * error and exit #MC handler.
- */
- return MCE_UC_SEVERITY;
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
}
/*
- * deferred error: poll handler catches these and adds to mce_ring so
- * memory-failure can take recovery actions.
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
*/
- if (m->status & MCI_STATUS_DEFERRED)
- return MCE_DEFERRED_SEVERITY;
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
/*
- * corrected error: poll handler catches these and passes responsibility
- * of decoding the error to EDAC
+ * On MCA overflow, without the MCA overflow recovery feature the
+ * system will not be able to recover, panic.
*/
- return MCE_KEEP_SEVERITY;
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
}
-static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp)
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m, regs);
@@ -403,23 +390,21 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
if (msg)
*msg = s->msg;
s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (tolerant < 1)
- return MCE_PANIC_SEVERITY;
- }
+
+ if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL)
+ return MCE_PANIC_SEVERITY;
+
return s->sev;
}
}
-/* Default to mce_severity_intel */
-int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) =
- mce_severity_intel;
-
-void __init mcheck_vendor_init_severity(void)
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- mce_severity = mce_severity_amd;
+ return mce_severity_amd(m, regs, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, msg, is_excp);
}
#ifdef CONFIG_DEBUG_FS
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index 9c9f0abd2d7f..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -17,7 +17,7 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static noinstr void winchip_machine_check(struct pt_regs *regs)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
@@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3d4a48336084..8b2fcdfa6d31 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -456,17 +456,23 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
-#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
"amd-ucode/microcode_amd_fam%.2xh.bin", family);
- return get_builtin_firmware(cp, fw_name);
-#else
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
return false;
-#endif
}
static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b935e1b5f115..ad57e0e4d674 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -55,7 +55,7 @@ LIST_HEAD(microcode_cache);
* All non cpu-hotplug-callback call sites use:
*
* - microcode_mutex to synchronize with each other;
- * - get/put_online_cpus() to synchronize with
+ * - cpus_read_lock/unlock() to synchronize with
* the cpu-hotplug-callback call sites.
*
* We guarantee that only a single cpu is being
@@ -140,23 +140,6 @@ static bool __init check_loader_disabled_bsp(void)
return *res;
}
-extern struct builtin_fw __start_builtin_fw[];
-extern struct builtin_fw __end_builtin_fw[];
-
-bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
- struct builtin_fw *b_fw;
-
- for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
- if (!strcmp(name, b_fw->name)) {
- cd->size = b_fw->size;
- cd->data = b_fw->data;
- return true;
- }
- }
- return false;
-}
-
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
@@ -390,101 +373,10 @@ static int apply_microcode_on_target(int cpu)
return ret;
}
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static int do_microcode_update(const void __user *buf, size_t size)
-{
- int error = 0;
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
-
- if (!uci->valid)
- continue;
-
- ustate = microcode_ops->request_microcode_user(cpu, buf, size);
- if (ustate == UCODE_ERROR) {
- error = -1;
- break;
- } else if (ustate == UCODE_NEW) {
- apply_microcode_on_target(cpu);
- }
- }
-
- return error;
-}
-
-static int microcode_open(struct inode *inode, struct file *file)
-{
- return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM;
-}
-
-static ssize_t microcode_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- ssize_t ret = -EINVAL;
- unsigned long nr_pages = totalram_pages();
-
- if ((len >> PAGE_SHIFT) > nr_pages) {
- pr_err("too much data (max %ld pages)\n", nr_pages);
- return ret;
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- if (do_microcode_update(buf, len) == 0)
- ret = (ssize_t)len;
-
- if (ret > 0)
- perf_check_microcode();
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- return ret;
-}
-
-static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
- .llseek = no_llseek,
-};
-
-static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .nodename = "cpu/microcode",
- .fops = &microcode_fops,
-};
-
-static int __init microcode_dev_init(void)
-{
- int error;
-
- error = misc_register(&microcode_dev);
- if (error) {
- pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
- return error;
- }
-
- return 0;
-}
-
-static void __exit microcode_dev_exit(void)
-{
- misc_deregister(&microcode_dev);
-}
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
-#endif
-
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
+#ifdef CONFIG_MICROCODE_LATE_LOADING
/*
* Late loading dance. Why the heavy-handed stomp_machine effort?
*
@@ -601,6 +493,9 @@ static int microcode_reload_late(void)
{
int ret;
+ pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+
atomic_set(&late_cpus_in, 0);
atomic_set(&late_cpus_out, 0);
@@ -629,29 +524,34 @@ static ssize_t reload_store(struct device *dev,
if (val != 1)
return size;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
- if (tmp_ret != UCODE_NEW)
- return size;
-
- get_online_cpus();
+ cpus_read_lock();
ret = check_online_cpus();
if (ret)
goto put;
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ goto put;
+
mutex_lock(&microcode_mutex);
ret = microcode_reload_late();
mutex_unlock(&microcode_mutex);
put:
- put_online_cpus();
+ cpus_read_unlock();
if (ret == 0)
ret = size;
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
return ret;
}
+static DEVICE_ATTR_WO(reload);
+#endif
+
static ssize_t version_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -668,7 +568,6 @@ static ssize_t pf_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static DEVICE_ATTR_WO(reload);
static DEVICE_ATTR(version, 0444, version_show, NULL);
static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL);
@@ -775,9 +674,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -789,7 +688,7 @@ static void mc_bp_resume(void)
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
@@ -821,7 +720,9 @@ static int mc_cpu_down_prep(unsigned int cpu)
}
static struct attribute *cpu_root_microcode_attrs[] = {
+#ifdef CONFIG_MICROCODE_LATE_LOADING
&dev_attr_reload.attr,
+#endif
NULL
};
@@ -853,14 +754,11 @@ static int __init microcode_init(void)
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&microcode_mutex);
-
error = subsys_interface_register(&mc_cpu_interface);
- if (!error)
- perf_check_microcode();
mutex_unlock(&microcode_mutex);
- put_online_cpus();
+ cpus_read_unlock();
if (error)
goto out_pdev;
@@ -873,10 +771,6 @@ static int __init microcode_init(void)
goto out_driver;
}
- error = microcode_dev_init();
- if (error)
- goto out_ucode_group;
-
register_syscore_ops(&mc_syscore_ops);
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
mc_cpu_starting, NULL);
@@ -887,18 +781,14 @@ static int __init microcode_init(void)
return 0;
- out_ucode_group:
- sysfs_remove_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
-
out_driver:
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&microcode_mutex);
subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
- put_online_cpus();
+ cpus_read_unlock();
out_pdev:
platform_device_unregister(microcode_pdev);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 7e8e07bddd5f..025c8f0cd948 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch;
/* last level cache size per core */
static int llc_size_per_core;
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
/*
* Returns 1 if update has been found, 0 otherwise.
*/
@@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
struct extended_signature *ext_sig;
int i;
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
return 1;
/* Look for ext. headers: */
@@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
@@ -342,37 +328,6 @@ next:
return patch;
}
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = x86_family(eax);
- model = x86_model(eax);
-
- if ((model >= 5) || (family > 6)) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
- uci->valid = 1;
-
- return 0;
-}
-
static void show_saved_mc(void)
{
#ifdef DEBUG
@@ -386,7 +341,7 @@ static void show_saved_mc(void)
return;
}
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
@@ -456,6 +411,7 @@ static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int s
static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
char name[30];
if (IS_ENABLED(CONFIG_X86_32))
@@ -466,7 +422,13 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
- return get_builtin_firmware(cp, name);
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
}
/*
@@ -495,7 +457,7 @@ void show_ucode_info_early(void)
struct ucode_cpu_info uci;
if (delay_ucode_info) {
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
print_ucode_info(&uci, current_mc_date);
delay_ucode_info = 0;
}
@@ -597,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void)
if (!(cp.data && cp.size))
return 0;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
scan_microcode(cp.data, cp.size, &uci, true);
@@ -630,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
if (!(cp.data && cp.size))
return NULL;
- collect_cpu_info_early(uci);
+ intel_cpu_collect_info(uci);
return scan_microcode(cp.data, cp.size, uci, false);
}
@@ -705,7 +667,7 @@ void reload_ucode_intel(void)
struct microcode_intel *p;
struct ucode_cpu_info uci;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
p = find_patch(&uci);
if (!p)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e88bc296afca..831613959a92 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
+#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -32,13 +33,11 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
+#include <asm/coco.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
-EXPORT_SYMBOL_GPL(hv_root_partition);
-
struct ms_hyperv_info ms_hyperv;
-EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
@@ -60,23 +59,16 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
-int hv_setup_vmbus_irq(int irq, void (*handler)(void))
+void hv_setup_vmbus_handler(void (*handler)(void))
{
- /*
- * The 'irq' argument is ignored on x86/x64 because a hard-coded
- * interrupt vector is used for Hyper-V interrupts.
- */
vmbus_handler = handler;
- return 0;
}
-void hv_remove_vmbus_irq(void)
+void hv_remove_vmbus_handler(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
-EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
/*
* Routines to do per-architecture handling of stimer0
@@ -89,51 +81,43 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
ack_APIC_irq();
set_irq_regs(old_regs);
}
-int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
{
- *vector = HYPERV_STIMER0_VECTOR;
- *irq = -1; /* Unused on x86/x64 */
hv_stimer0_handler = handler;
- return 0;
}
-EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
-void hv_remove_stimer0_irq(int irq)
+void hv_remove_stimer0_handler(void)
{
/* We have no way to deallocate the interrupt gate */
hv_stimer0_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
void hv_setup_kexec_handler(void (*handler)(void))
{
hv_kexec_handler = handler;
}
-EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
void hv_remove_kexec_handler(void)
{
hv_kexec_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
{
hv_crash_handler = handler;
}
-EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
void hv_remove_crash_handler(void)
{
hv_crash_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
#ifdef CONFIG_KEXEC_CORE
static void hv_machine_shutdown(void)
@@ -181,12 +165,22 @@ static uint32_t __init ms_hyperv_platform(void)
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- if (eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12))
- return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+ if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX ||
+ memcmp("Microsoft Hv", hyp_signature, 12))
+ return 0;
- return 0;
+ /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */
+ eax = cpuid_eax(HYPERV_CPUID_FEATURES);
+ if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) {
+ pr_warn("x86/hyperv: HYPERCALL MSR not available.\n");
+ return 0;
+ }
+ if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) {
+ pr_warn("x86/hyperv: VP_INDEX MSR not available.\n");
+ return 0;
+ }
+
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
}
static unsigned char hv_get_nmi_reason(void)
@@ -197,7 +191,7 @@ static unsigned char hv_get_nmi_reason(void)
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
- * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle
* unknown NMI on the first CPU which gets it.
*/
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
@@ -261,6 +255,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
static void __init ms_hyperv_init_platform(void)
{
+ int hv_max_functions_eax;
int hv_host_info_eax;
int hv_host_info_ebx;
int hv_host_info_ecx;
@@ -274,12 +269,15 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
- ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+
+ pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -306,17 +304,16 @@ static void __init ms_hyperv_init_platform(void)
/*
* Extract host information.
*/
- if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
- HYPERV_CPUID_VERSION) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
- pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
- hv_host_info_eax, hv_host_info_ebx >> 16,
- hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
- hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+ pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
+ hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF,
+ hv_host_info_eax, hv_host_info_edx & 0xFFFFFF,
+ hv_host_info_ecx, hv_host_info_edx >> 24);
}
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
@@ -325,29 +322,35 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
- if (ms_hyperv.features_b & HV_ISOLATION) {
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ static_branch_enable(&isolation_type_snp);
+#ifdef CONFIG_SWIOTLB
+ swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
+#endif
+ }
+ /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
+ cc_set_vendor(CC_VENDOR_HYPERV);
+ }
}
- if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
}
- /*
- * Hyper-V expects to get crash register data or kmsg when
- * crash enlightment is available and system crashes. Set
- * crash_kexec_post_notifiers to be true to make sure that
- * calling crash enlightment interface before running kdump
- * kernel.
- */
- if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
- crash_kexec_post_notifiers = true;
-
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
@@ -376,10 +379,17 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ /*
+ * Writing to synthetic MSR 0x40000118 updates/changes the
+ * guest visible CPUIDs. Setting bit 0 of this MSR enables
+ * guests to report invariant TSC feature through CPUID
+ * instruction, CPUID 0x800000007/EDX, bit 8. See code in
+ * early_init_intel() where this bit is examined. The
+ * setting of this MSR bit should happen before init_intel()
+ * is called.
+ */
wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- } else {
- mark_tsc_unstable("running on Hyper-V");
}
/*
@@ -428,7 +438,7 @@ static void __init ms_hyperv_init_platform(void)
/*
* Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
- * set x2apic destination mode to physcial mode when x2apic is available
+ * set x2apic destination mode to physical mode when x2apic is available
* and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
* have 8-bit APIC id.
*/
@@ -440,6 +450,15 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
+
+ hardlockup_detector_disable();
}
static bool __init ms_hyperv_x2apic_available(void)
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 9231640782fa..b5f43049fa5f 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
state->range_sizek = sizek - second_sizek;
}
-/* Mininum size of mtrr block that can take hole: */
+/* Minimum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void)
if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index b90f3f437765..558108296f3c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- rdmsr(MSR_K8_SYSCFG, lo, hi);
+ rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
- mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi);
}
}
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 28c8a23aa42e..2746cac9d8a9 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -336,7 +336,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
replace = -1;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
@@ -398,7 +398,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
error = i;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
@@ -485,7 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
@@ -520,7 +520,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
error = reg;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
@@ -799,7 +799,7 @@ void mtrr_ap_init(void)
*
* This routine is called in two cases:
*
- * 1. very earily time of software resume, when there absolutely
+ * 1. very early time of software resume, when there absolutely
* isn't mtrr entry changes;
*
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 3ef5868ac588..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,7 +63,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_PERFCTR0;
@@ -96,7 +96,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
- fallthrough;
+ break;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 4eec8889b0ff..099b6f0d96bd 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = aperfmperf_get_khz(cpu);
-
- if (!freq)
- freq = cpufreq_quick_get(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ unsigned int freq = arch_freq_get_on_cpu(cpu);
+
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 698bb26aeb6e..bb1c3f5f60c8 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -57,128 +57,57 @@ static void
mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
struct rdt_resource *r);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
-struct rdt_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L3] =
{
- .rid = RDT_RESOURCE_L3,
- .name = "L3",
- .domains = domain_init(RDT_RESOURCE_L3),
- .msr_base = MSR_IA32_L3_CBM_BASE,
- .msr_update = cat_wrmsr,
- .cache_level = 3,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 1,
- .cbm_idx_offset = 0,
+ .r_resctrl = {
+ .rid = RDT_RESOURCE_L3,
+ .name = "L3",
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ },
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
- },
- [RDT_RESOURCE_L3DATA] =
- {
- .rid = RDT_RESOURCE_L3DATA,
- .name = "L3DATA",
- .domains = domain_init(RDT_RESOURCE_L3DATA),
- .msr_base = MSR_IA32_L3_CBM_BASE,
- .msr_update = cat_wrmsr,
- .cache_level = 3,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 2,
- .cbm_idx_offset = 0,
- },
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
- },
- [RDT_RESOURCE_L3CODE] =
- {
- .rid = RDT_RESOURCE_L3CODE,
- .name = "L3CODE",
- .domains = domain_init(RDT_RESOURCE_L3CODE),
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
- .cache_level = 3,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 2,
- .cbm_idx_offset = 1,
- },
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L2] =
{
- .rid = RDT_RESOURCE_L2,
- .name = "L2",
- .domains = domain_init(RDT_RESOURCE_L2),
- .msr_base = MSR_IA32_L2_CBM_BASE,
- .msr_update = cat_wrmsr,
- .cache_level = 2,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 1,
- .cbm_idx_offset = 0,
+ .r_resctrl = {
+ .rid = RDT_RESOURCE_L2,
+ .name = "L2",
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ },
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
},
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
- },
- [RDT_RESOURCE_L2DATA] =
- {
- .rid = RDT_RESOURCE_L2DATA,
- .name = "L2DATA",
- .domains = domain_init(RDT_RESOURCE_L2DATA),
- .msr_base = MSR_IA32_L2_CBM_BASE,
- .msr_update = cat_wrmsr,
- .cache_level = 2,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 2,
- .cbm_idx_offset = 0,
- },
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
- },
- [RDT_RESOURCE_L2CODE] =
- {
- .rid = RDT_RESOURCE_L2CODE,
- .name = "L2CODE",
- .domains = domain_init(RDT_RESOURCE_L2CODE),
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
- .cache_level = 2,
- .cache = {
- .min_cbm_bits = 1,
- .cbm_idx_mult = 2,
- .cbm_idx_offset = 1,
- },
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_MBA] =
{
- .rid = RDT_RESOURCE_MBA,
- .name = "MB",
- .domains = domain_init(RDT_RESOURCE_MBA),
- .cache_level = 3,
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .r_resctrl = {
+ .rid = RDT_RESOURCE_MBA,
+ .name = "MB",
+ .cache_level = 3,
+ .domains = domain_init(RDT_RESOURCE_MBA),
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*u",
+ .fflags = RFTYPE_RES_MB,
+ },
},
};
-static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
-{
- return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset;
-}
-
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -192,14 +121,15 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
* Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
* Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
*
- * Probe by trying to write the first of the L3 cach mask registers
+ * Probe by trying to write the first of the L3 cache mask registers
* and checking that the bits stick. Max CLOSids is always 4 and max cbm length
* is always 20 on hsw server parts. The minimum cache bitmask length
* allowed for HSW server is always 2 bits. Hardcode all of them.
*/
static inline void cache_alloc_hsw_probe(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &hw_res->r_resctrl;
u32 l, h, max_cbm = BIT_MASK(20) - 1;
if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0))
@@ -211,7 +141,7 @@ static inline void cache_alloc_hsw_probe(void)
if (l != max_cbm)
return;
- r->num_closid = 4;
+ hw_res->num_closid = 4;
r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
@@ -225,7 +155,7 @@ static inline void cache_alloc_hsw_probe(void)
bool is_mba_sc(struct rdt_resource *r)
{
if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
+ return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
return r->membw.mba_sc;
}
@@ -253,12 +183,13 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r)
static bool __get_mem_config_intel(struct rdt_resource *r)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
union cpuid_0x10_x_edx edx;
u32 ebx, ecx, max_delay;
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
- r->num_closid = edx.split.cos_max + 1;
+ hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
r->default_ctrl = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
@@ -287,12 +218,13 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
union cpuid_0x10_x_edx edx;
u32 ebx, ecx;
cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full);
- r->num_closid = edx.split.cos_max + 1;
+ hw_res->num_closid = edx.split.cos_max + 1;
r->default_ctrl = MAX_MBA_BW_AMD;
/* AMD does not use delay */
@@ -317,12 +249,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
u32 ebx, ecx;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
- r->num_closid = edx.split.cos_max + 1;
+ hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
r->cache.shareable_bits = ebx & r->default_ctrl;
@@ -331,43 +264,35 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
r->alloc_enabled = true;
}
-static void rdt_get_cdp_config(int level, int type)
+static void rdt_get_cdp_config(int level)
{
- struct rdt_resource *r_l = &rdt_resources_all[level];
- struct rdt_resource *r = &rdt_resources_all[type];
-
- r->num_closid = r_l->num_closid / 2;
- r->cache.cbm_len = r_l->cache.cbm_len;
- r->default_ctrl = r_l->default_ctrl;
- r->cache.shareable_bits = r_l->cache.shareable_bits;
- r->data_width = (r->cache.cbm_len + 3) / 4;
- r->alloc_capable = true;
/*
* By default, CDP is disabled. CDP can be enabled by mount parameter
* "cdp" during resctrl file system mount time.
*/
- r->alloc_enabled = false;
+ rdt_resources_all[level].cdp_enabled = false;
+ rdt_resources_all[level].r_resctrl.cdp_capable = true;
}
static void rdt_get_cdp_l3_config(void)
{
- rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
- rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
+ rdt_get_cdp_config(RDT_RESOURCE_L3);
}
static void rdt_get_cdp_l2_config(void)
{
- rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
- rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
+ rdt_get_cdp_config(RDT_RESOURCE_L2);
}
static void
mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
{
unsigned int i;
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
- wrmsrl(r->msr_base + i, d->ctrl_val[i]);
+ wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
/*
@@ -389,19 +314,23 @@ mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
struct rdt_resource *r)
{
unsigned int i;
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
}
static void
cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
{
unsigned int i;
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
- wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
+ wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
@@ -417,16 +346,22 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
return NULL;
}
+u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->num_closid;
+}
+
void rdt_ctrl_update(void *arg)
{
struct msr_param *m = arg;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
struct rdt_resource *r = m->res;
int cpu = smp_processor_id();
struct rdt_domain *d;
d = get_domain_from_cpu(cpu, r);
if (d) {
- r->msr_update(d, m, r);
+ hw_res->msr_update(d, m, r);
return;
}
pr_warn_once("cpu %d not found in any domain for resource %s\n",
@@ -468,6 +403,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
int i;
/*
@@ -476,7 +412,7 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
* For Memory Allocation: Set b/w requested to 100%
* and the bandwidth in MBps to U32_MAX
*/
- for (i = 0; i < r->num_closid; i++, dc++, dm++) {
+ for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) {
*dc = r->default_ctrl;
*dm = MBA_MAX_MBPS;
}
@@ -484,26 +420,30 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
u32 *dc, *dm;
- dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
+ dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
+ GFP_KERNEL);
if (!dc)
return -ENOMEM;
- dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+ dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val),
+ GFP_KERNEL);
if (!dm) {
kfree(dc);
return -ENOMEM;
}
- d->ctrl_val = dc;
- d->mbps_val = dm;
+ hw_dom->ctrl_val = dc;
+ hw_dom->mbps_val = dm;
setup_default_ctrlval(r, dc, dm);
m.low = 0;
- m.high = r->num_closid;
- r->msr_update(d, &m, r);
+ m.high = hw_res->num_closid;
+ hw_res->msr_update(d, &m, r);
return 0;
}
@@ -560,6 +500,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
struct list_head *add_pos = NULL;
+ struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
d = rdt_find_domain(r, id, &add_pos);
@@ -575,22 +516,25 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
}
- d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
- if (!d)
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
return;
+ d = &hw_dom->d_resctrl;
d->id = id;
cpumask_set_cpu(cpu, &d->cpu_mask);
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(d);
+ kfree(hw_dom);
return;
}
if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(d);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom->mbps_val);
+ kfree(hw_dom);
return;
}
@@ -607,6 +551,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
{
int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
d = rdt_find_domain(r, id, NULL);
@@ -614,6 +559,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
return;
}
+ hw_dom = resctrl_to_arch_dom(d);
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
@@ -646,16 +592,16 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
if (d->plr)
d->plr->d = NULL;
- kfree(d->ctrl_val);
- kfree(d->mbps_val);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom->mbps_val);
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
- kfree(d);
+ kfree(hw_dom);
return;
}
- if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0);
@@ -732,13 +678,8 @@ static int resctrl_offline_cpu(unsigned int cpu)
static __init void rdt_init_padding(void)
{
struct rdt_resource *r;
- int cl;
for_each_alloc_capable_rdt_resource(r) {
- cl = strlen(r->name);
- if (cl > max_name_width)
- max_name_width = cl;
-
if (r->data_width > max_data_width)
max_data_width = r->data_width;
}
@@ -827,19 +768,22 @@ static bool __init rdt_cpu_has(int flag)
static __init bool get_mem_config(void)
{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
+
if (!rdt_cpu_has(X86_FEATURE_MBA))
return false;
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- return __get_mem_config_intel(&rdt_resources_all[RDT_RESOURCE_MBA]);
+ return __get_mem_config_intel(&hw_res->r_resctrl);
else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- return __rdt_get_mem_config_amd(&rdt_resources_all[RDT_RESOURCE_MBA]);
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
return false;
}
static __init bool get_rdt_alloc_resources(void)
{
+ struct rdt_resource *r;
bool ret = false;
if (rdt_alloc_capable)
@@ -849,14 +793,16 @@ static __init bool get_rdt_alloc_resources(void)
return false;
if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ rdt_get_cache_alloc_cfg(1, r);
if (rdt_cpu_has(X86_FEATURE_CDP_L3))
rdt_get_cdp_l3_config();
ret = true;
}
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
- rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
+ rdt_get_cache_alloc_cfg(2, r);
if (rdt_cpu_has(X86_FEATURE_CDP_L2))
rdt_get_cdp_l2_config();
ret = true;
@@ -870,6 +816,8 @@ static __init bool get_rdt_alloc_resources(void)
static __init bool get_rdt_mon_resources(void)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
@@ -880,7 +828,7 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
- return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+ return !rdt_get_mon_l3_config(r);
}
static __init void __check_quirks_intel(void)
@@ -918,42 +866,40 @@ static __init bool get_rdt_resources(void)
static __init void rdt_init_res_defs_intel(void)
{
+ struct rdt_hw_resource *hw_res;
struct rdt_resource *r;
for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
if (r->rid == RDT_RESOURCE_L3 ||
- r->rid == RDT_RESOURCE_L3DATA ||
- r->rid == RDT_RESOURCE_L3CODE ||
- r->rid == RDT_RESOURCE_L2 ||
- r->rid == RDT_RESOURCE_L2DATA ||
- r->rid == RDT_RESOURCE_L2CODE) {
+ r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = false;
r->cache.arch_has_empty_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
} else if (r->rid == RDT_RESOURCE_MBA) {
- r->msr_base = MSR_IA32_MBA_THRTL_BASE;
- r->msr_update = mba_wrmsr_intel;
+ hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
+ hw_res->msr_update = mba_wrmsr_intel;
}
}
}
static __init void rdt_init_res_defs_amd(void)
{
+ struct rdt_hw_resource *hw_res;
struct rdt_resource *r;
for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
if (r->rid == RDT_RESOURCE_L3 ||
- r->rid == RDT_RESOURCE_L3DATA ||
- r->rid == RDT_RESOURCE_L3CODE ||
- r->rid == RDT_RESOURCE_L2 ||
- r->rid == RDT_RESOURCE_L2DATA ||
- r->rid == RDT_RESOURCE_L2CODE) {
+ r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = true;
r->cache.arch_has_empty_bitmaps = true;
r->cache.arch_has_per_cpu_cfg = true;
} else if (r->rid == RDT_RESOURCE_MBA) {
- r->msr_base = MSR_IA32_MBA_BW_BASE;
- r->msr_update = mba_wrmsr_amd;
+ hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
}
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index c877642e8a14..87666275eed9 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -57,20 +57,23 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
struct rdt_domain *d)
{
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
unsigned long bw_val;
- if (d->have_new_ctrl) {
+ cfg = &d->staged_config[s->conf_type];
+ if (cfg->have_new_ctrl) {
rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
return -EINVAL;
}
if (!bw_validate(data->buf, &bw_val, r))
return -EINVAL;
- d->new_ctrl = bw_val;
- d->have_new_ctrl = true;
+ cfg->new_ctrl = bw_val;
+ cfg->have_new_ctrl = true;
return 0;
}
@@ -125,13 +128,16 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
struct rdt_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
u32 cbm_val;
- if (d->have_new_ctrl) {
+ cfg = &d->staged_config[s->conf_type];
+ if (cfg->have_new_ctrl) {
rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
return -EINVAL;
}
@@ -160,12 +166,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
* The CBM may not overlap with the CBM of another closid if
* either is exclusive.
*/
- if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) {
+ if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
rdt_last_cmd_puts("Overlaps with exclusive group\n");
return -EINVAL;
}
- if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) {
+ if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
rdt_last_cmd_puts("Overlaps with other group\n");
@@ -173,8 +179,8 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
}
}
- d->new_ctrl = cbm_val;
- d->have_new_ctrl = true;
+ cfg->new_ctrl = cbm_val;
+ cfg->have_new_ctrl = true;
return 0;
}
@@ -185,9 +191,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
* separated by ";". The "id" is in decimal, and must match one of
* the "id"s for this resource.
*/
-static int parse_line(char *line, struct rdt_resource *r,
+static int parse_line(char *line, struct resctrl_schema *s,
struct rdtgroup *rdtgrp)
{
+ enum resctrl_conf_type t = s->conf_type;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
struct rdt_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
@@ -213,9 +222,10 @@ next:
if (d->id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, r, d))
+ if (r->parse_ctrlval(&data, s, d))
return -EINVAL;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ cfg = &d->staged_config[t];
/*
* In pseudo-locking setup mode and just
* parsed a valid CBM that should be
@@ -224,9 +234,9 @@ next:
* the required initialization for single
* region and return.
*/
- rdtgrp->plr->r = r;
+ rdtgrp->plr->s = s;
rdtgrp->plr->d = d;
- rdtgrp->plr->cbm = d->new_ctrl;
+ rdtgrp->plr->cbm = cfg->new_ctrl;
d->plr = rdtgrp->plr;
return 0;
}
@@ -236,28 +246,72 @@ next:
return -EINVAL;
}
-int update_domains(struct rdt_resource *r, int closid)
+static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
{
+ switch (type) {
+ default:
+ case CDP_NONE:
+ return closid;
+ case CDP_CODE:
+ return closid * 2 + 1;
+ case CDP_DATA:
+ return closid * 2;
+ }
+}
+
+static bool apply_config(struct rdt_hw_domain *hw_dom,
+ struct resctrl_staged_config *cfg, u32 idx,
+ cpumask_var_t cpu_mask, bool mba_sc)
+{
+ struct rdt_domain *dom = &hw_dom->d_resctrl;
+ u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val;
+
+ if (cfg->new_ctrl != dc[idx]) {
+ cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
+ dc[idx] = cfg->new_ctrl;
+
+ return true;
+ }
+
+ return false;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
+ enum resctrl_conf_type t;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
bool mba_sc;
- u32 *dc;
int cpu;
+ u32 idx;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
- msr_param.low = closid;
- msr_param.high = msr_param.low + 1;
- msr_param.res = r;
-
mba_sc = is_mba_sc(r);
+ msr_param.res = NULL;
list_for_each_entry(d, &r->domains, list) {
- dc = !mba_sc ? d->ctrl_val : d->mbps_val;
- if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
- dc[closid] = d->new_ctrl;
+ hw_dom = resctrl_to_arch_dom(d);
+ for (t = 0; t < CDP_NUM_TYPES; t++) {
+ cfg = &hw_dom->d_resctrl.staged_config[t];
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ idx = get_config_index(closid, t);
+ if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc))
+ continue;
+
+ if (!msr_param.res) {
+ msr_param.low = idx;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+ } else {
+ msr_param.low = min(msr_param.low, idx);
+ msr_param.high = max(msr_param.high, idx + 1);
+ }
}
}
@@ -284,11 +338,11 @@ done:
static int rdtgroup_parse_resource(char *resname, char *tok,
struct rdtgroup *rdtgrp)
{
- struct rdt_resource *r;
+ struct resctrl_schema *s;
- for_each_alloc_enabled_rdt_resource(r) {
- if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid)
- return parse_line(tok, r, rdtgrp);
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
+ return parse_line(tok, s, rdtgrp);
}
rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
return -EINVAL;
@@ -297,6 +351,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok,
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct resctrl_schema *s;
struct rdtgroup *rdtgrp;
struct rdt_domain *dom;
struct rdt_resource *r;
@@ -327,9 +382,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- for_each_alloc_enabled_rdt_resource(r) {
- list_for_each_entry(dom, &r->domains, list)
- dom->have_new_ctrl = false;
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ list_for_each_entry(dom, &s->res->domains, list)
+ memset(dom->staged_config, 0, sizeof(dom->staged_config));
}
while ((tok = strsep(&buf, "\n")) != NULL) {
@@ -349,8 +404,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- for_each_alloc_enabled_rdt_resource(r) {
- ret = update_domains(r, rdtgrp->closid);
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
+ ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret)
goto out;
}
@@ -371,19 +427,31 @@ out:
return ret ?: nbytes;
}
-static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ u32 idx = get_config_index(closid, type);
+
+ if (!is_mba_sc(r))
+ return hw_dom->ctrl_val[idx];
+ return hw_dom->mbps_val[idx];
+}
+
+static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
{
+ struct rdt_resource *r = schema->res;
struct rdt_domain *dom;
bool sep = false;
u32 ctrl_val;
- seq_printf(s, "%*s:", max_name_width, r->name);
+ seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
seq_puts(s, ";");
- ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
- dom->mbps_val[closid]);
+ ctrl_val = resctrl_arch_get_config(r, dom, closid,
+ schema->conf_type);
seq_printf(s, r->format_str, dom->id, max_data_width,
ctrl_val);
sep = true;
@@ -394,16 +462,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
+ struct resctrl_schema *schema;
struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
int ret = 0;
u32 closid;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- for_each_alloc_enabled_rdt_resource(r)
- seq_printf(s, "%s:uninitialized\n", r->name);
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ seq_printf(s, "%s:uninitialized\n", schema->name);
+ }
} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
if (!rdtgrp->plr->d) {
rdt_last_cmd_clear();
@@ -411,15 +480,15 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
ret = -ENODEV;
} else {
seq_printf(s, "%s:%d=%x\n",
- rdtgrp->plr->r->name,
+ rdtgrp->plr->s->res->name,
rdtgrp->plr->d->id,
rdtgrp->plr->cbm);
}
} else {
closid = rdtgrp->closid;
- for_each_alloc_enabled_rdt_resource(r) {
- if (closid < r->num_closid)
- show_doms(s, r, closid);
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ if (closid < schema->num_closid)
+ show_doms(s, schema, closid);
}
}
} else {
@@ -449,6 +518,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
+ struct rdt_hw_resource *hw_res;
u32 resid, evtid, domid;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
@@ -468,7 +538,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
domid = md.u.domid;
evtid = md.u.evtid;
- r = &rdt_resources_all[resid];
+ hw_res = &rdt_resources_all[resid];
+ r = &hw_res->r_resctrl;
d = rdt_find_domain(r, domid, NULL);
if (IS_ERR_OR_NULL(d)) {
ret = -ENOENT;
@@ -482,7 +553,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
else if (rr.val & RMID_VAL_UNAVAIL)
seq_puts(m, "Unavailable\n");
else
- seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+ seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale);
out:
rdtgroup_kn_unlock(of->kn);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c4d320d02fd5..1d647188a43b 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_RESCTRL_INTERNAL_H
#define _ASM_X86_RESCTRL_INTERNAL_H
+#include <linux/resctrl.h>
#include <linux/sched.h>
#include <linux/kernfs.h>
#include <linux/fs_context.h>
@@ -70,6 +71,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
* @name: name of the event
+ * @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
u32 evtid;
@@ -78,10 +80,13 @@ struct mon_evt {
};
/**
- * struct mon_data_bits - Monitoring details for each event file
- * @rid: Resource id associated with the event file.
+ * union mon_data_bits - Monitoring details for each event file
+ * @priv: Used to store monitoring event data in @u
+ * as kernfs private data
+ * @rid: Resource id associated with the event file
* @evtid: Event id associated with the event file
* @domid: The domain to which the event file belongs
+ * @u: Name of the bit fields struct
*/
union mon_data_bits {
void *priv;
@@ -105,6 +110,7 @@ extern unsigned int resctrl_cqm_threshold;
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features;
+extern struct list_head resctrl_schema_all;
enum rdt_group_type {
RDTCTRL_GROUP = 0,
@@ -119,6 +125,7 @@ enum rdt_group_type {
* @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
* @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
* allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Total number of modes
*
* The mode of a resource group enables control over the allowed overlap
* between allocations associated with different resource groups (classes
@@ -142,7 +149,7 @@ enum rdtgrp_mode {
/**
* struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn kernlfs node for the mon_data directory
+ * @mon_data_kn: kernfs node for the mon_data directory
* @parent: parent rdtgrp
* @crdtgrp_list: child rdtgroup node list
* @rmid: rmid for this rdtgroup
@@ -156,8 +163,8 @@ struct mongroup {
/**
* struct pseudo_lock_region - pseudo-lock region information
- * @r: RDT resource to which this pseudo-locked region
- * belongs
+ * @s: Resctrl schema for the resource to which this
+ * pseudo-locked region belongs
* @d: RDT domain to which this pseudo-locked region
* belongs
* @cbm: bitmask of the pseudo-locked region
@@ -177,7 +184,7 @@ struct mongroup {
* @pm_reqs: Power management QoS requests related to this region
*/
struct pseudo_lock_region {
- struct rdt_resource *r;
+ struct resctrl_schema *s;
struct rdt_domain *d;
u32 cbm;
wait_queue_head_t lock_thread_wq;
@@ -282,11 +289,11 @@ struct rftype {
/**
* struct mbm_state - status for each MBM counter in each domain
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
* @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
- * @prev_bw The most recent bandwidth in MBps
- * @delta_bw Difference between the current and previous bandwidth
- * @delta_comp Indicates whether to compute the delta_bw
+ * @prev_bw: The most recent bandwidth in MBps
+ * @delta_bw: Difference between the current and previous bandwidth
+ * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 chunks;
@@ -298,44 +305,25 @@ struct mbm_state {
};
/**
- * struct rdt_domain - group of cpus sharing an RDT resource
- * @list: all instances of this resource
- * @id: unique id for this instance
- * @cpu_mask: which cpus share this resource
- * @rmid_busy_llc:
- * bitmap of which limbo RMIDs are above threshold
- * @mbm_total: saved state for MBM total bandwidth
- * @mbm_local: saved state for MBM local bandwidth
- * @mbm_over: worker to periodically read MBM h/w counters
- * @cqm_limbo: worker to periodically read CQM h/w counters
- * @mbm_work_cpu:
- * worker cpu for MBM h/w counters
- * @cqm_work_cpu:
- * worker cpu for CQM h/w counters
+ * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
+ * a resource
+ * @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
* @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
- * @new_ctrl: new ctrl value to be loaded
- * @have_new_ctrl: did user provide new_ctrl for this domain
- * @plr: pseudo-locked region (if any) associated with domain
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
*/
-struct rdt_domain {
- struct list_head list;
- int id;
- struct cpumask cpu_mask;
- unsigned long *rmid_busy_llc;
- struct mbm_state *mbm_total;
- struct mbm_state *mbm_local;
- struct delayed_work mbm_over;
- struct delayed_work cqm_limbo;
- int mbm_work_cpu;
- int cqm_work_cpu;
+struct rdt_hw_domain {
+ struct rdt_domain d_resctrl;
u32 *ctrl_val;
u32 *mbps_val;
- u32 new_ctrl;
- bool have_new_ctrl;
- struct pseudo_lock_region *plr;
};
+static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
+{
+ return container_of(r, struct rdt_hw_domain, d_resctrl);
+}
+
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
@@ -344,69 +332,8 @@ struct rdt_domain {
*/
struct msr_param {
struct rdt_resource *res;
- int low;
- int high;
-};
-
-/**
- * struct rdt_cache - Cache allocation related data
- * @cbm_len: Length of the cache bit mask
- * @min_cbm_bits: Minimum number of consecutive bits to be set
- * @cbm_idx_mult: Multiplier of CBM index
- * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
- * closid * cbm_idx_multi + cbm_idx_offset
- * in a cache bit mask
- * @shareable_bits: Bitmask of shareable resource with other
- * executing entities
- * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid.
- * @arch_has_empty_bitmaps: True if the '0' bitmap is valid.
- * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache
- * level has CPU scope.
- */
-struct rdt_cache {
- unsigned int cbm_len;
- unsigned int min_cbm_bits;
- unsigned int cbm_idx_mult;
- unsigned int cbm_idx_offset;
- unsigned int shareable_bits;
- bool arch_has_sparse_bitmaps;
- bool arch_has_empty_bitmaps;
- bool arch_has_per_cpu_cfg;
-};
-
-/**
- * enum membw_throttle_mode - System's memory bandwidth throttling mode
- * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system
- * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core
- * always using smallest bandwidth percentage
- * assigned to threads, aka "max throttling"
- * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread
- */
-enum membw_throttle_mode {
- THREAD_THROTTLE_UNDEFINED = 0,
- THREAD_THROTTLE_MAX,
- THREAD_THROTTLE_PER_THREAD,
-};
-
-/**
- * struct rdt_membw - Memory bandwidth allocation related data
- * @min_bw: Minimum memory bandwidth percentage user can request
- * @bw_gran: Granularity at which the memory bandwidth is allocated
- * @delay_linear: True if memory B/W delay is in linear scale
- * @arch_needs_linear: True if we can't configure non-linear resources
- * @throttle_mode: Bandwidth throttling mode when threads request
- * different memory bandwidths
- * @mba_sc: True if MBA software controller(mba_sc) is enabled
- * @mb_map: Mapping of memory B/W percentage to memory B/W delay
- */
-struct rdt_membw {
- u32 min_bw;
- u32 bw_gran;
- u32 delay_linear;
- bool arch_needs_linear;
- enum membw_throttle_mode throttle_mode;
- bool mba_sc;
- u32 *mb_map;
+ u32 low;
+ u32 high;
};
static inline bool is_llc_occupancy_enabled(void)
@@ -441,109 +368,103 @@ struct rdt_parse_data {
};
/**
- * struct rdt_resource - attributes of an RDT resource
- * @rid: The index of the resource
- * @alloc_enabled: Is allocation enabled on this machine
- * @mon_enabled: Is monitoring enabled for this feature
- * @alloc_capable: Is allocation available on this machine
- * @mon_capable: Is monitor feature available on this machine
- * @name: Name to use in "schemata" file
- * @num_closid: Number of CLOSIDs available
- * @cache_level: Which cache level defines scope of this resource
- * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource
+ * @r_resctrl: Attributes of the resource used directly by resctrl.
+ * @num_closid: Maximum number of closid this hardware can support,
+ * regardless of CDP. This is exposed via
+ * resctrl_arch_get_num_closid() to avoid confusion
+ * with struct resctrl_schema's property of the same name,
+ * which has been corrected for features like CDP.
* @msr_base: Base MSR address for CBMs
* @msr_update: Function pointer to update QOS MSRs
- * @data_width: Character width of data when displaying
- * @domains: All domains for this resource
- * @cache: Cache allocation related data
- * @format_str: Per resource format string to show domain value
- * @parse_ctrlval: Per resource function pointer to parse control values
- * @evt_list: List of monitoring events
- * @num_rmid: Number of RMIDs available
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
- * @fflags: flags to choose base and info files
+ * @mbm_width: Monitor width, to detect and correct for overflow.
+ * @cdp_enabled: CDP state of this resource
+ *
+ * Members of this structure are either private to the architecture
+ * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
+ * msr_update and msr_base.
*/
-struct rdt_resource {
- int rid;
- bool alloc_enabled;
- bool mon_enabled;
- bool alloc_capable;
- bool mon_capable;
- char *name;
- int num_closid;
- int cache_level;
- u32 default_ctrl;
+struct rdt_hw_resource {
+ struct rdt_resource r_resctrl;
+ u32 num_closid;
unsigned int msr_base;
void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
struct rdt_resource *r);
- int data_width;
- struct list_head domains;
- struct rdt_cache cache;
- struct rdt_membw membw;
- const char *format_str;
- int (*parse_ctrlval)(struct rdt_parse_data *data,
- struct rdt_resource *r,
- struct rdt_domain *d);
- struct list_head evt_list;
- int num_rmid;
unsigned int mon_scale;
unsigned int mbm_width;
- unsigned long fflags;
+ bool cdp_enabled;
};
-int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
+{
+ return container_of(r, struct rdt_hw_resource, r_resctrl);
+}
+
+int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
struct rdt_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
-extern struct rdt_resource rdt_resources_all[];
+extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
extern struct dentry *debugfs_resctrl;
-enum {
+enum resctrl_res_level {
RDT_RESOURCE_L3,
- RDT_RESOURCE_L3DATA,
- RDT_RESOURCE_L3CODE,
RDT_RESOURCE_L2,
- RDT_RESOURCE_L2DATA,
- RDT_RESOURCE_L2CODE,
RDT_RESOURCE_MBA,
/* Must be the last */
RDT_NUM_RESOURCES,
};
+static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
+
+ hw_res++;
+ return &hw_res->r_resctrl;
+}
+
+static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
+{
+ return rdt_resources_all[l].cdp_enabled;
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
+
+/*
+ * To return the common struct rdt_resource, which is contained in struct
+ * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
+ */
#define for_each_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++)
+ for (r = &rdt_resources_all[0].r_resctrl; \
+ r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
+ r = resctrl_inc(r))
#define for_each_capable_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ for_each_rdt_resource(r) \
if (r->alloc_capable || r->mon_capable)
#define for_each_alloc_capable_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ for_each_rdt_resource(r) \
if (r->alloc_capable)
#define for_each_mon_capable_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ for_each_rdt_resource(r) \
if (r->mon_capable)
#define for_each_alloc_enabled_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ for_each_rdt_resource(r) \
if (r->alloc_enabled)
#define for_each_mon_enabled_rdt_resource(r) \
- for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
- r++) \
+ for_each_rdt_resource(r) \
if (r->mon_enabled)
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
@@ -587,7 +508,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
unsigned long cbm, int closid, bool exclusive);
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
unsigned long cbm);
@@ -602,7 +523,6 @@ void rdt_pseudo_lock_release(void);
int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
-int update_domains(struct rdt_resource *r, int closid);
int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(void);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 7ac31210e452..eaf25a234ff5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
static const struct mbm_correction_factor_table {
u32 rmidthreshold;
u64 cf;
-} mbm_cf_table[] __initdata = {
+} mbm_cf_table[] __initconst = {
{7, CF(1.000000)},
{15, CF(1.000000)},
{15, CF(0.969650)},
@@ -174,7 +174,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
struct rdt_resource *r;
u32 crmid = 1, nrmid;
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
@@ -232,7 +232,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
int cpu;
u64 val;
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
entry->busy = 0;
cpu = get_cpu();
@@ -282,18 +282,18 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
+ return chunks >> shift;
}
-static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
struct mbm_state *m;
u64 chunks, tval;
tval = __rmid_read(rmid, rr->evtid);
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
- rr->val = tval;
- return -EINVAL;
+ return tval;
}
switch (rr->evtid) {
case QOS_L3_OCCUP_EVENT_ID:
@@ -307,10 +307,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
break;
default:
/*
- * Code would never reach here because
- * an invalid event id would fail the __rmid_read.
+ * Code would never reach here because an invalid
+ * event id would fail the __rmid_read.
*/
- return -EINVAL;
+ return RMID_VAL_ERROR;
}
if (rr->first) {
@@ -319,7 +319,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
return 0;
}
- chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width);
+ chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width);
m->chunks += chunks;
m->prev_msr = tval;
@@ -334,7 +334,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
*/
static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
struct mbm_state *m = &rr->d->mbm_local[rmid];
u64 tval, cur_bw, chunks;
@@ -342,8 +342,8 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
return;
- chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
- cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20;
+ chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width);
+ cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20;
if (m->delta_comp)
m->delta_bw = abs(cur_bw - m->prev_bw);
@@ -361,23 +361,29 @@ void mon_event_count(void *info)
struct rdtgroup *rdtgrp, *entry;
struct rmid_read *rr = info;
struct list_head *head;
+ u64 ret_val;
rdtgrp = rr->rgrp;
- if (__mon_event_count(rdtgrp->mon.rmid, rr))
- return;
+ ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
/*
- * For Ctrl groups read data from child monitor groups.
+ * For Ctrl groups read data from child monitor groups and
+ * add them together. Count events which are read successfully.
+ * Discard the rmid_read's reporting errors.
*/
head = &rdtgrp->mon.crdtgrp_list;
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr))
- return;
+ if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ ret_val = 0;
}
}
+
+ /* Report error if none of rmid_reads are successful */
+ if (ret_val)
+ rr->val = ret_val;
}
/*
@@ -387,7 +393,7 @@ void mon_event_count(void *info)
* adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
* that:
*
- * current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
*
* This uses the MBM counters to measure the bandwidth and MBA throttle
* MSRs to control the bandwidth for a particular rdtgrp. It builds on the
@@ -397,7 +403,7 @@ void mon_event_count(void *info)
* timer. Having 1s interval makes the calculation of bandwidth simpler.
*
* Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
* the L2 <-> L3 traffic.
*
* Since MBA controls the L2 external bandwidth where as MBM measures the
@@ -416,6 +422,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
+ struct rdt_hw_resource *hw_r_mba;
+ struct rdt_hw_domain *hw_dom_mba;
u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
struct rdt_domain *dom_mba;
@@ -425,7 +433,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
if (!is_mbm_local_enabled())
return;
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+ hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+ r_mba = &hw_r_mba->r_resctrl;
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
pmbm_data = &dom_mbm->mbm_local[rmid];
@@ -435,11 +444,16 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
pr_warn_once("Failure to get domain for MBA update\n");
return;
}
+ hw_dom_mba = resctrl_to_arch_dom(dom_mba);
cur_bw = pmbm_data->prev_bw;
- user_bw = dom_mba->mbps_val[closid];
+ user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
delta_bw = pmbm_data->delta_bw;
- cur_msr_val = dom_mba->ctrl_val[closid];
+ /*
+ * resctrl_arch_get_config() chooses the mbps/ctrl value to return
+ * based on is_mba_sc(). For now, reach into the hw_dom.
+ */
+ cur_msr_val = hw_dom_mba->ctrl_val[closid];
/*
* For Ctrl groups read data from child monitor groups.
@@ -474,13 +488,13 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
return;
}
- cur_msr = r_mba->msr_base + closid;
+ cur_msr = hw_r_mba->msr_base + closid;
wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
- dom_mba->ctrl_val[closid] = new_msr_val;
+ hw_dom_mba->ctrl_val[closid] = new_msr_val;
/*
* Delta values are updated dynamically package wise for each
- * rdtgrp everytime the throttle MSR changes value.
+ * rdtgrp every time the throttle MSR changes value.
*
* This is because (1)the increase in bandwidth is not perfectly
* linear and only "approximately" linear even when the hardware
@@ -538,7 +552,7 @@ void cqm_handle_limbo(struct work_struct *work)
mutex_lock(&rdtgroup_mutex);
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
d = container_of(work, struct rdt_domain, cqm_limbo.work);
__check_limbo(d, false);
@@ -574,7 +588,7 @@ void mbm_handle_overflow(struct work_struct *work)
if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
d = container_of(work, struct rdt_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
@@ -671,15 +685,16 @@ static void l3_mon_evt_init(struct rdt_resource *r)
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int cl_size = boot_cpu_data.x86_cache_size;
int ret;
- r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
- r->mbm_width = MBM_CNTR_WIDTH_BASE;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
- r->mbm_width += mbm_offset;
+ hw_res->mbm_width += mbm_offset;
else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
pr_warn("Ignoring impossible MBM counter offset\n");
@@ -693,7 +708,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
- resctrl_cqm_threshold /= r->mon_scale;
+ resctrl_cqm_threshold /= hw_res->mon_scale;
ret = dom_data_init(r);
if (ret)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index e916646adc69..db813f819ad6 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -49,6 +49,7 @@ static struct class *pseudo_lock_class;
/**
* get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
* pseudo-locking. This includes testing to ensure pseudo-locked regions
@@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor)
}
/**
- * pseudo_lock_pm_req - A power management QoS request list entry
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
* @list: Entry within the @pm_reqs list for a pseudo-locked region
* @req: PM QoS request
*/
@@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
/**
* pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
*
* To prevent the cache from being affected by power management entering
* C6 has to be avoided. This is accomplished by requesting a latency
@@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
* the ACPI latencies need to be considered while keeping in mind that C2
* may be set to map to deeper sleep states. In this case the latency
* requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
*/
static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
{
@@ -246,7 +250,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
plr->line_size = 0;
kfree(plr->kmem);
plr->kmem = NULL;
- plr->r = NULL;
+ plr->s = NULL;
if (plr->d)
plr->d->plr = NULL;
plr->d = NULL;
@@ -290,10 +294,10 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
ci = get_cpu_cacheinfo(plr->cpu);
- plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm);
+ plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == plr->r->cache_level) {
+ if (ci->info_list[i].level == plr->s->res->cache_level) {
plr->line_size = ci->info_list[i].coherency_line_size;
return 0;
}
@@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
/**
* rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @r: resource group being queried
+ * @rdtgrp: resource group being queried
*
* Return: 1 if monitor groups have been created for this resource
* group, 0 otherwise.
@@ -684,8 +688,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* resource, the portion of cache used by it should be made
* unavailable to all future allocations from both resources.
*/
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled ||
- rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) {
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
+ resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
rdt_last_cmd_puts("CDP enabled\n");
return -EINVAL;
}
@@ -796,7 +800,7 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm
unsigned long cbm_b;
if (d->plr) {
- cbm_len = d->plr->r->cache.cbm_len;
+ cbm_len = d->plr->s->res->cache.cbm_len;
cbm_b = d->plr->cbm;
if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
return true;
@@ -1140,6 +1144,8 @@ out:
/**
* pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
*
* The measurement of latency to access a pseudo-locked region should be
* done from a cpu that is associated with that pseudo-locked region.
@@ -1307,7 +1313,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
* If the thread does not get on the CPU for whatever
* reason and the process which sets up the region is
* interrupted then this will leave the thread in runnable
- * state and once it gets on the CPU it will derefence
+ * state and once it gets on the CPU it will dereference
* the cleared, but not freed, plr struct resulting in an
* empty pseudo-locking loop.
*/
@@ -1391,7 +1397,7 @@ out:
* group is removed from user space via a "rmdir" from userspace or the
* unmount of the resctrl filesystem. On removal the resource group does
* not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus assymmetry with the creation where the
+ * removed directly. There is thus asymmetry with the creation where the
* &struct pseudo_lock_region is removed here while it was not created in
* rdtgroup_pseudo_lock_create().
*
@@ -1458,7 +1464,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags)
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
{
/* Not supported */
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f9190adc52cb..f276aff521e8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * User interface for Resource Alloction in Resource Director Technology(RDT)
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
*
* Copyright (C) 2016 Intel Corporation
*
@@ -39,6 +39,9 @@ static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
+/* list of entries for the schemata file */
+LIST_HEAD(resctrl_schema_all);
+
/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;
@@ -100,12 +103,12 @@ int closids_supported(void)
static void closid_init(void)
{
- struct rdt_resource *r;
- int rdt_min_closid = 32;
+ struct resctrl_schema *s;
+ u32 rdt_min_closid = 32;
/* Compute rdt_min_closid across all resources */
- for_each_alloc_enabled_rdt_resource(r)
- rdt_min_closid = min(rdt_min_closid, r->num_closid);
+ list_for_each_entry(s, &resctrl_schema_all, list)
+ rdt_min_closid = min(rdt_min_closid, s->num_closid);
closid_free_map = BIT_MASK(rdt_min_closid) - 1;
@@ -294,7 +297,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
/*
* This is safe against resctrl_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
- * from update_closid_rmid() is proteced against __switch_to() because
+ * from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
static void update_cpu_closid_rmid(void *info)
@@ -338,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
return -EINVAL;
}
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Give any dropped cpus to parent rdtgroup */
cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
update_closid_rmid(tmpmask, prgrp);
@@ -356,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu rmid
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
if (crgrp == rdtgrp)
@@ -391,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Can't drop from default group */
if (rdtgrp == &rdtgroup_default) {
rdt_last_cmd_puts("Can't drop CPUs from default group\n");
@@ -410,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu closid/rmid.
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
if (r == rdtgrp)
continue;
cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (cpumask_weight(tmpmask1))
+ if (!cpumask_empty(tmpmask1))
cpumask_rdtgrp_clear(r, tmpmask1);
}
update_closid_rmid(tmpmask, rdtgrp);
@@ -485,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
/* check that user didn't specify any offline cpus */
cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
ret = -EINVAL;
rdt_last_cmd_puts("Can only assign online CPUs\n");
goto unlock;
@@ -842,16 +845,17 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
- seq_printf(seq, "%d\n", r->num_closid);
+ seq_printf(seq, "%u\n", s->num_closid);
return 0;
}
static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->default_ctrl);
return 0;
@@ -860,7 +864,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of,
static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
return 0;
@@ -869,7 +874,8 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
static int rdt_shareable_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->cache.shareable_bits);
return 0;
@@ -892,38 +898,40 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
/*
* Use unsigned long even though only 32 bits are used to ensure
* test_bit() is used safely.
*/
unsigned long sw_shareable = 0, hw_shareable = 0;
unsigned long exclusive = 0, pseudo_locked = 0;
+ struct rdt_resource *r = s->res;
struct rdt_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
bool sep = false;
- u32 *ctrl;
+ u32 ctrl_val;
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
list_for_each_entry(dom, &r->domains, list) {
if (sep)
seq_putc(seq, ';');
- ctrl = dom->ctrl_val;
sw_shareable = 0;
exclusive = 0;
seq_printf(seq, "%d=", dom->id);
- for (i = 0; i < closids_supported(); i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++) {
if (!closid_allocated(i))
continue;
+ ctrl_val = resctrl_arch_get_config(r, dom, i,
+ s->conf_type);
mode = rdtgroup_mode_by_closid(i);
switch (mode) {
case RDT_MODE_SHAREABLE:
- sw_shareable |= *ctrl;
+ sw_shareable |= ctrl_val;
break;
case RDT_MODE_EXCLUSIVE:
- exclusive |= *ctrl;
+ exclusive |= ctrl_val;
break;
case RDT_MODE_PSEUDO_LOCKSETUP:
/*
@@ -970,7 +978,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.min_bw);
return 0;
@@ -1001,7 +1010,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.bw_gran);
return 0;
@@ -1010,7 +1020,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of,
static int rdt_delay_linear_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.delay_linear);
return 0;
@@ -1020,8 +1031,9 @@ static int max_threshold_occ_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale);
+ seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
return 0;
}
@@ -1029,7 +1041,8 @@ static int max_threshold_occ_show(struct kernfs_open_file *of,
static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
seq_puts(seq, "per-thread\n");
@@ -1042,7 +1055,7 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_hw_resource *hw_res;
unsigned int bytes;
int ret;
@@ -1053,7 +1066,8 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
if (bytes > (boot_cpu_data.x86_cache_size * 1024))
return -EINVAL;
- resctrl_cqm_threshold = bytes / r->mon_scale;
+ hw_res = resctrl_to_arch_res(of->kn->parent->priv);
+ resctrl_cqm_threshold = bytes / hw_res->mon_scale;
return nbytes;
}
@@ -1078,76 +1092,17 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of,
return 0;
}
-/**
- * rdt_cdp_peer_get - Retrieve CDP peer if it exists
- * @r: RDT resource to which RDT domain @d belongs
- * @d: Cache instance for which a CDP peer is requested
- * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
- * Used to return the result.
- * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
- * Used to return the result.
- *
- * RDT resources are managed independently and by extension the RDT domains
- * (RDT resource instances) are managed independently also. The Code and
- * Data Prioritization (CDP) RDT resources, while managed independently,
- * could refer to the same underlying hardware. For example,
- * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
- *
- * When provided with an RDT resource @r and an instance of that RDT
- * resource @d rdt_cdp_peer_get() will return if there is a peer RDT
- * resource and the exact instance that shares the same hardware.
- *
- * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
- * If a CDP peer was found, @r_cdp will point to the peer RDT resource
- * and @d_cdp will point to the peer RDT domain.
- */
-static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
- struct rdt_resource **r_cdp,
- struct rdt_domain **d_cdp)
+static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
{
- struct rdt_resource *_r_cdp = NULL;
- struct rdt_domain *_d_cdp = NULL;
- int ret = 0;
-
- switch (r->rid) {
- case RDT_RESOURCE_L3DATA:
- _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
- break;
- case RDT_RESOURCE_L3CODE:
- _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA];
- break;
- case RDT_RESOURCE_L2DATA:
- _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE];
- break;
- case RDT_RESOURCE_L2CODE:
- _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA];
- break;
+ switch (my_type) {
+ case CDP_CODE:
+ return CDP_DATA;
+ case CDP_DATA:
+ return CDP_CODE;
default:
- ret = -ENOENT;
- goto out;
- }
-
- /*
- * When a new CPU comes online and CDP is enabled then the new
- * RDT domains (if any) associated with both CDP RDT resources
- * are added in the same CPU online routine while the
- * rdtgroup_mutex is held. It should thus not happen for one
- * RDT domain to exist and be associated with its RDT CDP
- * resource but there is no RDT domain associated with the
- * peer RDT CDP resource. Hence the WARN.
- */
- _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
- if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
- _r_cdp = NULL;
- _d_cdp = NULL;
- ret = -EINVAL;
+ case CDP_NONE:
+ return CDP_NONE;
}
-
-out:
- *r_cdp = _r_cdp;
- *d_cdp = _d_cdp;
-
- return ret;
}
/**
@@ -1171,11 +1126,11 @@ out:
* Return: false if CBM does not overlap, true if it does.
*/
static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
- unsigned long cbm, int closid, bool exclusive)
+ unsigned long cbm, int closid,
+ enum resctrl_conf_type type, bool exclusive)
{
enum rdtgrp_mode mode;
unsigned long ctrl_b;
- u32 *ctrl;
int i;
/* Check for any overlap with regions used by hardware directly */
@@ -1186,9 +1141,8 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
}
/* Check for overlap with other resource groups */
- ctrl = d->ctrl_val;
- for (i = 0; i < closids_supported(); i++, ctrl++) {
- ctrl_b = *ctrl;
+ for (i = 0; i < closids_supported(); i++) {
+ ctrl_b = resctrl_arch_get_config(r, d, i, type);
mode = rdtgroup_mode_by_closid(i);
if (closid_allocated(i) && i != closid &&
mode != RDT_MODE_PSEUDO_LOCKSETUP) {
@@ -1208,7 +1162,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
/**
* rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
- * @r: Resource to which domain instance @d belongs.
+ * @s: Schema for the resource to which domain instance @d belongs.
* @d: The domain instance for which @closid is being tested.
* @cbm: Capacity bitmask being tested.
* @closid: Intended closid for @cbm.
@@ -1226,19 +1180,19 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
*
* Return: true if CBM overlap detected, false if there is no overlap
*/
-bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
unsigned long cbm, int closid, bool exclusive)
{
- struct rdt_resource *r_cdp;
- struct rdt_domain *d_cdp;
+ enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
+ struct rdt_resource *r = s->res;
- if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
+ if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
+ exclusive))
return true;
- if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
+ if (!resctrl_arch_get_cdp_enabled(r->rid))
return false;
-
- return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
+ return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
}
/**
@@ -1256,17 +1210,21 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
+ struct resctrl_schema *s;
struct rdt_resource *r;
bool has_cache = false;
struct rdt_domain *d;
+ u32 ctrl;
- for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
if (r->rid == RDT_RESOURCE_MBA)
continue;
has_cache = true;
list_for_each_entry(d, &r->domains, list) {
- if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
- rdtgrp->closid, false)) {
+ ctrl = resctrl_arch_get_config(r, d, closid,
+ s->conf_type);
+ if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
rdt_last_cmd_puts("Schemata overlaps\n");
return false;
}
@@ -1397,6 +1355,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
static int rdtgroup_size_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
+ struct resctrl_schema *schema;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
struct rdt_domain *d;
@@ -1418,8 +1377,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
ret = -ENODEV;
} else {
seq_printf(s, "%*s:", max_name_width,
- rdtgrp->plr->r->name);
- size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
+ rdtgrp->plr->s->name);
+ size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
rdtgrp->plr->d,
rdtgrp->plr->cbm);
seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
@@ -1427,18 +1386,19 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
goto out;
}
- for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ r = schema->res;
sep = false;
- seq_printf(s, "%*s:", max_name_width, r->name);
+ seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(d, &r->domains, list) {
if (sep)
seq_putc(s, ';');
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
size = 0;
} else {
- ctrl = (!is_mba_sc(r) ?
- d->ctrl_val[rdtgrp->closid] :
- d->mbps_val[rdtgrp->closid]);
+ ctrl = resctrl_arch_get_config(r, d,
+ rdtgrp->closid,
+ schema->conf_type);
if (r->rid == RDT_RESOURCE_MBA)
size = ctrl;
else
@@ -1757,14 +1717,14 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
return ret;
}
-static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
unsigned long fflags)
{
struct kernfs_node *kn_subdir;
int ret;
kn_subdir = kernfs_create_dir(kn_info, name,
- kn_info->mode, r);
+ kn_info->mode, priv);
if (IS_ERR(kn_subdir))
return PTR_ERR(kn_subdir);
@@ -1781,6 +1741,7 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
+ struct resctrl_schema *s;
struct rdt_resource *r;
unsigned long fflags;
char name[32];
@@ -1795,9 +1756,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
if (ret)
goto out_destroy;
- for_each_alloc_enabled_rdt_resource(r) {
+ /* loop over enabled controls, these are all alloc_enabled */
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
fflags = r->fflags | RF_CTRL_INFO;
- ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
+ ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
@@ -1867,7 +1830,7 @@ static void l2_qos_cfg_update(void *arg)
static inline bool is_mba_linear(void)
{
- return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
+ return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
}
static int set_cache_qos_cfg(int level, bool enable)
@@ -1888,7 +1851,7 @@ static int set_cache_qos_cfg(int level, bool enable)
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
- r_l = &rdt_resources_all[level];
+ r_l = &rdt_resources_all[level].r_resctrl;
list_for_each_entry(d, &r_l->domains, list) {
if (r_l->cache.arch_has_per_cpu_cfg)
/* Pick all the CPUs in the domain instance */
@@ -1914,14 +1877,16 @@ static int set_cache_qos_cfg(int level, bool enable)
/* Restore the qos cfg state when a domain comes online */
void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
{
- if (!r->alloc_capable)
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (!r->cdp_capable)
return;
- if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
- l2_qos_cfg_update(&r->alloc_enabled);
+ if (r->rid == RDT_RESOURCE_L2)
+ l2_qos_cfg_update(&hw_res->cdp_enabled);
- if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
- l3_qos_cfg_update(&r->alloc_enabled);
+ if (r->rid == RDT_RESOURCE_L3)
+ l3_qos_cfg_update(&hw_res->cdp_enabled);
}
/*
@@ -1932,7 +1897,8 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
*/
static int set_mba_sc(bool mba_sc)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
if (!is_mbm_enabled() || !is_mba_linear() ||
@@ -1940,73 +1906,60 @@ static int set_mba_sc(bool mba_sc)
return -EINVAL;
r->membw.mba_sc = mba_sc;
- list_for_each_entry(d, &r->domains, list)
- setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
+ list_for_each_entry(d, &r->domains, list) {
+ hw_dom = resctrl_to_arch_dom(d);
+ setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val);
+ }
return 0;
}
-static int cdp_enable(int level, int data_type, int code_type)
+static int cdp_enable(int level)
{
- struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
- struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
- struct rdt_resource *r_l = &rdt_resources_all[level];
+ struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
int ret;
- if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
- !r_lcode->alloc_capable)
+ if (!r_l->alloc_capable)
return -EINVAL;
ret = set_cache_qos_cfg(level, true);
- if (!ret) {
- r_l->alloc_enabled = false;
- r_ldata->alloc_enabled = true;
- r_lcode->alloc_enabled = true;
- }
+ if (!ret)
+ rdt_resources_all[level].cdp_enabled = true;
+
return ret;
}
-static int cdpl3_enable(void)
+static void cdp_disable(int level)
{
- return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
- RDT_RESOURCE_L3CODE);
-}
+ struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
-static int cdpl2_enable(void)
-{
- return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
- RDT_RESOURCE_L2CODE);
+ if (r_hw->cdp_enabled) {
+ set_cache_qos_cfg(level, false);
+ r_hw->cdp_enabled = false;
+ }
}
-static void cdp_disable(int level, int data_type, int code_type)
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
{
- struct rdt_resource *r = &rdt_resources_all[level];
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
- r->alloc_enabled = r->alloc_capable;
+ if (!hw_res->r_resctrl.cdp_capable)
+ return -EINVAL;
- if (rdt_resources_all[data_type].alloc_enabled) {
- rdt_resources_all[data_type].alloc_enabled = false;
- rdt_resources_all[code_type].alloc_enabled = false;
- set_cache_qos_cfg(level, false);
- }
-}
+ if (enable)
+ return cdp_enable(l);
-static void cdpl3_disable(void)
-{
- cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
-}
+ cdp_disable(l);
-static void cdpl2_disable(void)
-{
- cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
+ return 0;
}
static void cdp_disable_all(void)
{
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
- cdpl3_disable();
- if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
- cdpl2_disable();
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
}
/*
@@ -2084,10 +2037,10 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx)
int ret = 0;
if (ctx->enable_cdpl2)
- ret = cdpl2_enable();
+ ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
if (!ret && ctx->enable_cdpl3)
- ret = cdpl3_enable();
+ ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
if (!ret && ctx->enable_mba_mbps)
ret = set_mba_sc(true);
@@ -2095,6 +2048,92 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx)
return ret;
}
+static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
+{
+ struct resctrl_schema *s;
+ const char *suffix = "";
+ int ret, cl;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ s->res = r;
+ s->num_closid = resctrl_arch_get_num_closid(r);
+ if (resctrl_arch_get_cdp_enabled(r->rid))
+ s->num_closid /= 2;
+
+ s->conf_type = type;
+ switch (type) {
+ case CDP_CODE:
+ suffix = "CODE";
+ break;
+ case CDP_DATA:
+ suffix = "DATA";
+ break;
+ case CDP_NONE:
+ suffix = "";
+ break;
+ }
+
+ ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
+ if (ret >= sizeof(s->name)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
+ cl = strlen(s->name);
+
+ /*
+ * If CDP is supported by this resource, but not enabled,
+ * include the suffix. This ensures the tabular format of the
+ * schemata file does not change between mounts of the filesystem.
+ */
+ if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
+ cl += 4;
+
+ if (cl > max_name_width)
+ max_name_width = cl;
+
+ INIT_LIST_HEAD(&s->list);
+ list_add(&s->list, &resctrl_schema_all);
+
+ return 0;
+}
+
+static int schemata_list_create(void)
+{
+ struct rdt_resource *r;
+ int ret = 0;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ if (resctrl_arch_get_cdp_enabled(r->rid)) {
+ ret = schemata_list_add(r, CDP_CODE);
+ if (ret)
+ break;
+
+ ret = schemata_list_add(r, CDP_DATA);
+ } else {
+ ret = schemata_list_add(r, CDP_NONE);
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void schemata_list_destroy(void)
+{
+ struct resctrl_schema *s, *tmp;
+
+ list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
@@ -2116,11 +2155,17 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_cdp;
+ ret = schemata_list_create();
+ if (ret) {
+ schemata_list_destroy();
+ goto out_mba;
+ }
+
closid_init();
ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
if (ret < 0)
- goto out_mba;
+ goto out_schemata_free;
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
@@ -2153,7 +2198,7 @@ static int rdt_get_tree(struct fs_context *fc)
static_branch_enable_cpuslocked(&rdt_enable_key);
if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
list_for_each_entry(dom, &r->domains, list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
}
@@ -2170,6 +2215,8 @@ out_mongrp:
kernfs_remove(kn_mongrp);
out_info:
kernfs_remove(kn_info);
+out_schemata_free:
+ schemata_list_destroy();
out_mba:
if (ctx->enable_mba_mbps)
set_mba_sc(false);
@@ -2257,6 +2304,8 @@ static int rdt_init_fs_context(struct fs_context *fc)
static int reset_all_ctrls(struct rdt_resource *r)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
@@ -2267,7 +2316,7 @@ static int reset_all_ctrls(struct rdt_resource *r)
msr_param.res = r;
msr_param.low = 0;
- msr_param.high = r->num_closid;
+ msr_param.high = hw_res->num_closid;
/*
* Disable resource control for this resource by setting all
@@ -2275,10 +2324,11 @@ static int reset_all_ctrls(struct rdt_resource *r)
* from each domain to update the MSRs below.
*/
list_for_each_entry(d, &r->domains, list) {
+ hw_dom = resctrl_to_arch_dom(d);
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
- for (i = 0; i < r->num_closid; i++)
- d->ctrl_val[i] = r->default_ctrl;
+ for (i = 0; i < hw_res->num_closid; i++)
+ hw_dom->ctrl_val[i] = r->default_ctrl;
}
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
@@ -2408,6 +2458,7 @@ static void rdt_kill_sb(struct super_block *sb)
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
+ schemata_list_destroy();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
static_branch_disable_cpuslocked(&rdt_enable_key);
@@ -2555,7 +2606,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
/*
* This creates a directory mon_data which contains the monitored data.
*
- * mon_data has one directory for each domain whic are named
+ * mon_data has one directory for each domain which are named
* in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
* with L3 domain looks as below:
* ./mon_data:
@@ -2642,23 +2693,24 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
* Set the RDT domain up to start off with all usable allocations. That is,
* all shareable and unused bits. All-zero CBM is invalid.
*/
-static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
+static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
u32 closid)
{
- struct rdt_resource *r_cdp = NULL;
- struct rdt_domain *d_cdp = NULL;
+ enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
+ enum resctrl_conf_type t = s->conf_type;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
u32 used_b = 0, unused_b = 0;
unsigned long tmp_cbm;
enum rdtgrp_mode mode;
- u32 peer_ctl, *ctrl;
+ u32 peer_ctl, ctrl_val;
int i;
- rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp);
- d->have_new_ctrl = false;
- d->new_ctrl = r->cache.shareable_bits;
+ cfg = &d->staged_config[t];
+ cfg->have_new_ctrl = false;
+ cfg->new_ctrl = r->cache.shareable_bits;
used_b = r->cache.shareable_bits;
- ctrl = d->ctrl_val;
- for (i = 0; i < closids_supported(); i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++) {
if (closid_allocated(i) && i != closid) {
mode = rdtgroup_mode_by_closid(i);
if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
@@ -2673,35 +2725,38 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
* usage to ensure there is no overlap
* with an exclusive group.
*/
- if (d_cdp)
- peer_ctl = d_cdp->ctrl_val[i];
+ if (resctrl_arch_get_cdp_enabled(r->rid))
+ peer_ctl = resctrl_arch_get_config(r, d, i,
+ peer_type);
else
peer_ctl = 0;
- used_b |= *ctrl | peer_ctl;
+ ctrl_val = resctrl_arch_get_config(r, d, i,
+ s->conf_type);
+ used_b |= ctrl_val | peer_ctl;
if (mode == RDT_MODE_SHAREABLE)
- d->new_ctrl |= *ctrl | peer_ctl;
+ cfg->new_ctrl |= ctrl_val | peer_ctl;
}
}
if (d->plr && d->plr->cbm > 0)
used_b |= d->plr->cbm;
unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
- d->new_ctrl |= unused_b;
+ cfg->new_ctrl |= unused_b;
/*
* Force the initial CBM to be valid, user can
* modify the CBM based on system availability.
*/
- d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r);
+ cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
/*
* Assign the u32 CBM to an unsigned long to ensure that
* bitmap_weight() does not access out-of-bound memory.
*/
- tmp_cbm = d->new_ctrl;
+ tmp_cbm = cfg->new_ctrl;
if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id);
+ rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
return -ENOSPC;
}
- d->have_new_ctrl = true;
+ cfg->have_new_ctrl = true;
return 0;
}
@@ -2716,13 +2771,13 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
* If there are no more shareable bits available on any domain then
* the entire allocation will fail.
*/
-static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid)
+static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
{
struct rdt_domain *d;
int ret;
- list_for_each_entry(d, &r->domains, list) {
- ret = __init_one_rdt_domain(d, r, closid);
+ list_for_each_entry(d, &s->res->domains, list) {
+ ret = __init_one_rdt_domain(d, s, closid);
if (ret < 0)
return ret;
}
@@ -2733,30 +2788,34 @@ static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid)
/* Initialize MBA resource with default values. */
static void rdtgroup_init_mba(struct rdt_resource *r)
{
+ struct resctrl_staged_config *cfg;
struct rdt_domain *d;
list_for_each_entry(d, &r->domains, list) {
- d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
- d->have_new_ctrl = true;
+ cfg = &d->staged_config[CDP_NONE];
+ cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
+ cfg->have_new_ctrl = true;
}
}
/* Initialize the RDT group's allocations. */
static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
+ struct resctrl_schema *s;
struct rdt_resource *r;
int ret;
- for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
if (r->rid == RDT_RESOURCE_MBA) {
rdtgroup_init_mba(r);
} else {
- ret = rdtgroup_init_cat(r, rdtgrp->closid);
+ ret = rdtgroup_init_cat(s, rdtgrp->closid);
if (ret < 0)
return ret;
}
- ret = update_domains(r, rdtgrp->closid);
+ ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Failed to initialize allocations\n");
return ret;
@@ -3124,13 +3183,13 @@ out:
static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
- if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
seq_puts(seq, ",cdp");
- if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
seq_puts(seq, ",cdpl2");
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
seq_puts(seq, ",mba_MBps");
return 0;
@@ -3162,13 +3221,13 @@ static int __init rdtgroup_setup_root(void)
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
- ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
+ ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE);
if (ret) {
kernfs_destroy_root(rdt_root);
goto out;
}
- rdtgroup_default.kn = rdt_root->kn;
+ rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
kernfs_activate(rdtgroup_default.kn);
out:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 972ec3bfa9c0..dbaa8326d6f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -26,6 +26,7 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
@@ -36,10 +37,13 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7449ef33f081..19876ebfb504 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,11 +7,121 @@
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * VA page slot ID uses same bit as the flag so it is important
+ * to ensure that the page is not already in backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
+/*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+ unsigned long page_index)
+{
+ pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+ return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+ struct inode *inode = file_inode(encl->backing);
+
+ shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
/*
* ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
* Pages" in the SDM.
@@ -22,9 +132,12 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
{
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
struct sgx_encl *encl = encl_page->encl;
+ pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
struct sgx_pageinfo pginfo;
struct sgx_backing b;
- pgoff_t page_index;
+ bool pcmd_page_empty;
+ u8 *pcmd_page;
int ret;
if (secs_page)
@@ -32,14 +145,21 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
else
page_index = PFN_DOWN(encl->size);
- ret = sgx_encl_get_backing(encl, page_index, &b);
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
+ page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
+ ret = sgx_encl_lookup_backing(encl, page_index, &b);
if (ret)
return ret;
pginfo.addr = encl_page->desc & PAGE_MASK;
pginfo.contents = (unsigned long)kmap_atomic(b.contents);
- pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
- b.pcmd_offset;
+ pcmd_page = kmap_atomic(b.pcmd);
+ pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
if (secs_page)
pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,10 +175,32 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = -EFAULT;
}
- kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+ memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
+
+ /*
+ * The area for the PCMD in the page was zeroed above. Check if the
+ * whole page is now empty meaning that all PCMD's have been zeroed:
+ */
+ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+ kunmap_atomic(pcmd_page);
kunmap_atomic((void *)(unsigned long)pginfo.contents);
- sgx_encl_put_backing(&b, false);
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
+
+ sgx_encl_truncate_backing_page(encl, page_index);
+
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
+ sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_atomic(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_atomic(pcmd_page);
+ }
+
+ put_page(b.pcmd);
return ret;
}
@@ -78,7 +220,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
@@ -383,7 +525,7 @@ const struct vm_operations_struct sgx_vm_ops = {
/**
* sgx_encl_release - Destroy an enclave instance
- * @kref: address of a kref inside &sgx_encl
+ * @ref: address of a kref inside &sgx_encl
*
* Used together with kref_put(). Frees all the resources associated with the
* enclave and the instance itself.
@@ -404,18 +546,20 @@ void sgx_encl_release(struct kref *ref)
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
kfree(entry);
+ /* Invoke scheduler to prevent soft lockups. */
+ cond_resched();
}
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
@@ -423,7 +567,7 @@ void sgx_encl_release(struct kref *ref)
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
@@ -574,10 +718,10 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
* 0 on success,
* -errno otherwise.
*/
-int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing)
{
- pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+ pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
struct page *contents;
struct page *pcmd;
@@ -585,7 +729,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
if (IS_ERR(contents))
return PTR_ERR(contents);
- pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+ pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
if (IS_ERR(pcmd)) {
put_page(contents);
return PTR_ERR(pcmd);
@@ -594,25 +738,118 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
backing->page_index = page_index;
backing->contents = contents;
backing->pcmd = pcmd;
- backing->pcmd_offset =
- (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
- sizeof(struct sgx_pcmd);
+ backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
return 0;
}
+/*
+ * When called from ksgxd, returns the mem_cgroup of a struct mm stored
+ * in the enclave's mm_list. When not called from ksgxd, just returns
+ * the mem_cgroup of the current task.
+ */
+static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * If called from normal task context, return the mem_cgroup
+ * of the current task's mm. The remainder of the handling is for
+ * ksgxd.
+ */
+ if (!current_is_ksgxd())
+ return get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * Search the enclave's mm_list to find an mm associated with
+ * this enclave to charge the allocation to.
+ */
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ memcg = get_mem_cgroup_from_mm(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ /*
+ * In the rare case that there isn't an mm associated with
+ * the enclave, set memcg to the current active mem_cgroup.
+ * This will be the root mem_cgroup if there is no active
+ * mem_cgroup.
+ */
+ if (!memcg)
+ return get_mem_cgroup_from_mm(NULL);
+
+ return memcg;
+}
+
/**
- * sgx_encl_put_backing() - Unpin the backing storage
+ * sgx_encl_alloc_backing() - allocate a new backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
* @backing: data for accessing backing storage for the page
- * @do_write: mark pages dirty
+ *
+ * When called from ksgxd, sets the active memcg from one of the
+ * mms in the enclave's mm_list prior to any backing page allocation,
+ * in order to ensure that shmem page allocations are charged to the
+ * enclave.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
*/
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write)
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
{
- if (do_write) {
- set_page_dirty(backing->pcmd);
- set_page_dirty(backing->contents);
- }
+ struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
+ struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+ int ret;
+
+ ret = sgx_encl_get_backing(encl, page_index, backing);
+
+ set_active_memcg(memcg);
+ mem_cgroup_put(encl_memcg);
+
+ return ret;
+}
+/**
+ * sgx_encl_lookup_backing() - retrieve an existing backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Retrieve a backing page for loading data back into an EPC page with ELDU.
+ * It is the caller's responsibility to ensure that it is appropriate to use
+ * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
+ * not used correctly, this will cause an allocation which is not accounted for.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ return sgx_encl_get_backing(encl, page_index, backing);
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing)
+{
put_page(backing->pcmd);
put_page(backing->contents);
}
@@ -686,7 +923,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
@@ -735,3 +972,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..332ef3568267 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
{
struct vm_area_struct *result;
- result = find_vma(mm, addr);
- if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+ result = vma_lookup(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops)
return -EINVAL;
*vma = result;
@@ -103,11 +103,14 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags);
+bool current_is_ksgxd(void);
void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
-int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
- struct sgx_backing *backing);
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write);
+int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing);
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
struct sgx_encl_page *page);
@@ -115,5 +118,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..fa04a73daf9c 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,41 +11,8 @@
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
-/**
- * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
- *
- * ENCLS has its own (positive value) error codes and also generates
- * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
- * with system error codes as everything percolates back up the stack.
- * Unfortunately (for us), we need to precisely identify each unique
- * error code, e.g. the action taken if EWB fails varies based on the
- * type of fault and on the exact SGX error code, i.e. we can't simply
- * convert all faults to -EFAULT.
- *
- * To make all three error types coexist, we set bit 30 to identify an
- * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
- * between positive (faults and SGX error codes) and negative (system
- * error codes) values.
- */
-#define ENCLS_FAULT_FLAG 0x40000000
-
/* Retrieve the encoded trapnr from the specified return code. */
-#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
/* Issue a WARN() about an ENCLS function. */
#define ENCLS_WARN(r, name) { \
@@ -55,6 +22,19 @@ enum sgx_encls_function {
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & SGX_ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
@@ -65,7 +45,7 @@ enum sgx_encls_function {
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
@@ -90,11 +70,7 @@ static inline bool encls_failed(int ret)
asm volatile( \
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret) \
: "a"(rax), inputs \
: "memory", "cc"); \
@@ -129,7 +105,7 @@ static inline bool encls_failed(int ret)
*
* Return:
* 0 on success,
- * trapnr with ENCLS_FAULT_FLAG set on fault
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
*/
#define __encls_N(rax, rbx_out, inputs...) \
({ \
@@ -138,11 +114,7 @@ static inline bool encls_failed(int ret)
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
" xor %%eax,%%eax;\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret), "=b"(rbx_out) \
: "a"(rax), inputs \
: "memory"); \
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 90a5caf76939..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
@@ -665,24 +667,11 @@ out:
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8df81a3ed945..a78652d43e61 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,19 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
+#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -17,48 +22,82 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
/*
* These variables are part of the state of the reclaimer, and must be accessed
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
+/*
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
+ */
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(&section->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(&section->lock);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
- page = list_first_entry(&section->init_laundry_list,
- struct sgx_epc_page, list);
+ continue;
+ }
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, &section->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(&section->lock);
+ }
cond_resched();
}
- list_splice(&dirty, &section->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -152,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
@@ -195,10 +236,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
/*
* Swap page to the regular memory transformed to the blocked state by using
- * EBLOCK, which means that it can no loger be referenced (no new TLB entries).
+ * EBLOCK, which means that it can no longer be referenced (no new TLB entries).
*
* The first trial just tries to write the page assuming that some other thread
- * has reset the count for threads inside the enlave by using ETRACK, and
+ * has reset the count for threads inside the enclave by using ETRACK, and
* previous thread count has been zeroed out. The second trial calls ETRACK
* before EWB. If that fails we kick all the HW threads out, and then do EWB,
* which should be guaranteed the succeed.
@@ -269,19 +310,20 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(epc_page, backing);
encl_page->epc_page = NULL;
encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
- ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
+ ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
&secs_backing);
if (ret)
goto out;
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
- sgx_encl_put_backing(&secs_backing, true);
+ sgx_encl_put_backing(&secs_backing);
}
out:
@@ -305,7 +347,6 @@ static void sgx_reclaim_pages(void)
{
struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
struct sgx_backing backing[SGX_NR_TO_SCAN];
- struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
pgoff_t page_index;
@@ -341,11 +382,14 @@ static void sgx_reclaim_pages(void)
goto skip;
page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
- ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
- if (ret)
- goto skip;
mutex_lock(&encl_page->encl->lock);
+ ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
+ goto skip;
+ }
+
encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
mutex_unlock(&encl_page->encl->lock);
continue;
@@ -373,56 +417,33 @@ skip:
encl_page = epc_page->owner;
sgx_reclaimer_write(epc_page, &backing[i]);
- sgx_encl_put_backing(&backing[i], true);
kref_put(&encl_page->encl->refcount, sgx_encl_release);
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
- section = &sgx_epc_sections[epc_page->section];
- spin_lock(&section->lock);
- list_add_tail(&epc_page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ sgx_free_epc_page(epc_page);
}
}
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
-
- return cnt;
-}
-
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
!list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -454,45 +475,62 @@ static bool __init sgx_page_reclaimer_init(void)
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+bool current_is_ksgxd(void)
{
- struct sgx_epc_page *page;
+ return current == ksgxd_tsk;
+}
+
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(&section->lock);
+ spin_lock(&node->lock);
- if (list_empty(&section->page_list)) {
- spin_unlock(&section->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ page->flags = 0;
+
+ spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
- spin_unlock(&section->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
- page = __sgx_alloc_epc_page_from_section(section);
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
+
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
@@ -598,23 +636,27 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
- spin_lock(&section->lock);
- list_add_tail(&page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,21 +677,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(&section->lock);
- INIT_LIST_HEAD(&section->page_list);
- INIT_LIST_HEAD(&section->init_laundry_list);
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ section->pages[i].poison = 0;
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously. Send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
/**
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
@@ -661,12 +784,59 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
((high & GENMASK_ULL(19, 0)) << 32);
}
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -689,6 +859,27 @@ static bool __init sgx_page_cache_init(void)
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
+ node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
+
sgx_nr_epc_sections++;
}
@@ -700,6 +891,67 @@ static bool __init sgx_page_cache_init(void)
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
static int __init sgx_init(void)
{
int ret;
@@ -716,12 +968,28 @@ static int __init sgx_init(void)
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5fa42d143feb..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,11 +8,15 @@
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
@@ -22,36 +26,39 @@
/* Pages, which are being tracked by the page reclaimer. */
#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
struct sgx_epc_page {
unsigned int section;
- unsigned int flags;
+ u16 flags;
+ u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
};
/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
+ spinlock_t lock;
+};
+
+/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +90,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..6a77a14eee38
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
+{
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+ return 0;
+}
+
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs is running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ xa_destroy(&vepc->page_array);
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 8678864ce712..132a2de44d2f 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package);
#ifdef CONFIG_SMP
/*
- * Check if given CPUID extended toplogy "leaf" is implemented
+ * Check if given CPUID extended topology "leaf" is implemented
*/
static int check_extended_topology_leaf(int leaf)
{
@@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf)
return 0;
}
/*
- * Return best CPUID Extended Toplogy Leaf supported
+ * Return best CPUID Extended Topology Leaf supported
*/
static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
{
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index e2ad30e474f8..ec7bbac3a9f2 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -2,7 +2,7 @@
/*
* Intel Transactional Synchronization Extensions (TSX) control.
*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
*
* Author:
* Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
@@ -19,7 +19,7 @@
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
-void tsx_disable(void)
+static void tsx_disable(void)
{
u64 tsx;
@@ -39,7 +39,7 @@ void tsx_disable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-void tsx_enable(void)
+static void tsx_enable(void)
{
u64 tsx;
@@ -58,7 +58,7 @@ void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool __init tsx_ctrl_is_supported(void)
+static bool tsx_ctrl_is_supported(void)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -84,13 +84,117 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing presents late microcode loading
+ * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause for the supported CPUID feature bits to get re-detected and, if
+ * RTM and HLE get cleared all of a sudden, but, userspace did consult
+ * them before the update, then funny explosions will happen. Long story
+ * short: the kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrl(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ } else if (tsx_ctrl_is_supported()) {
+ rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
+ }
+}
+
void __init tsx_init(void)
{
char arg[5] = {};
int ret;
- if (!tsx_ctrl_is_supported())
+ tsx_dev_mode_disable();
+
+ /*
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ return;
+ }
+
+ if (!tsx_ctrl_is_supported()) {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
return;
+ }
ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
if (ret >= 0) {
@@ -142,3 +246,16 @@ void __init tsx_init(void)
setup_force_cpu_cap(X86_FEATURE_HLE);
}
}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index c6ede3b3d302..c04b933f48d3 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -27,6 +27,7 @@
#include <linux/clocksource.h>
#include <linux/cpu.h>
#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void)
vmware_cyc2ns_setup();
if (vmw_sched_clock)
- pv_ops.time.sched_clock = vmware_sched_clock;
+ paravirt_set_sched_clock(vmware_sched_clock);
if (vmware_is_stealclock_available()) {
has_steal_clock = true;
- pv_ops.time.steal_clock = vmware_steal_clock;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
/* We use reboot notifier only to disable steal clock */
register_reboot_notifier(&vmware_pv_reboot_nb);
@@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
setup_force_cpu_cap(X86_FEATURE_VMCALL);
else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a8f3af257e26..9730c88530fc 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
-/*
- * When the crashkernel option is specified, only use the low
- * 1M for the real mode trampoline.
- */
-void __init crash_reserve_low_1M(void)
-{
- if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
- return;
-
- memblock_reserve(0, 1<<20);
- pr_info("Reserving the low 1M of memory for crashkernel\n");
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -323,8 +310,8 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
cmem->nr_ranges = 1;
/* Exclude elf header region */
- start = image->arch.elf_load_addr;
- end = start + image->arch.elf_headers_sz - 1;
+ start = image->elf_load_addr;
+ end = start + image->elf_headers_sz - 1;
return crash_exclude_mem_range(cmem, start, end);
}
@@ -337,7 +324,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(sizeof(struct crash_mem));
+ cmem = vzalloc(struct_size(cmem, ranges, 1));
if (!cmem)
return -ENOMEM;
@@ -407,20 +394,20 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
- image->arch.elf_headers = kbuf.buffer;
- image->arch.elf_headers_sz = kbuf.bufsz;
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret) {
- vfree((void *)image->arch.elf_headers);
+ vfree((void *)image->elf_headers);
return ret;
}
- image->arch.elf_load_addr = kbuf.mem;
+ image->elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 5fcac46aaf6b..5f4ae5476e19 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -10,8 +10,7 @@
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/crash_dump.h>
-
-#include <linux/uaccess.h>
+#include <linux/uio.h>
static inline bool is_crashed_pfn_valid(unsigned long pfn)
{
@@ -29,21 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn)
#endif
}
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there might be no pte mapped
- * in the current kernel.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
void *vaddr;
@@ -54,14 +40,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
return -EFAULT;
vaddr = kmap_local_pfn(pfn);
-
- if (!userbuf) {
- memcpy(buf, vaddr + offset, csize);
- } else {
- if (copy_to_user(buf, vaddr + offset, csize))
- csize = -EFAULT;
- }
-
+ csize = copy_to_iter(vaddr + offset, csize, iter);
kunmap_local(vaddr);
return csize;
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045e82e8945b..e75bc2f217ff 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -8,11 +8,12 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/io.h>
+#include <linux/cc_platform.h>
-static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf,
+static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset,
bool encrypted)
{
void *vaddr;
@@ -28,50 +29,36 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
if (!vaddr)
return -ENOMEM;
- if (userbuf) {
- if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
- iounmap((void __iomem *)vaddr);
- return -EFAULT;
- }
- } else
- memcpy(buf, vaddr + offset, csize);
+ csize = copy_to_iter(vaddr + offset, csize, iter);
- set_iounmap_nonlazy();
iounmap((void __iomem *)vaddr);
return csize;
}
-/**
- * copy_oldmem_page - copy one page of memory
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from the old kernel's memory. For this page, there is no pte
- * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
- return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+ return __copy_oldmem_page(iter, pfn, csize, offset, false);
}
-/**
+/*
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accommodate kdump on SME-enabled
* machines.
*/
-ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset)
{
- return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+ return __copy_oldmem_page(iter, pfn, csize, offset, true);
}
ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, sev_active());
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6a4cb71c2498..5cd51f25f446 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-void __init early_init_dt_scan_chosen_arch(unsigned long node)
-{
- BUG();
-}
-
void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
BUG();
@@ -139,12 +134,11 @@ static void __init dtb_cpu_setup(void)
{
struct device_node *dn;
u32 apic_id, version;
- int ret;
version = GET_APIC_VERSION(apic_read(APIC_LVR));
for_each_of_cpu_node(dn) {
- ret = of_property_read_u32(dn, "reg", &apic_id);
- if (ret < 0) {
+ apic_id = of_get_cpu_hwid(dn, 0);
+ if (apic_id == ~0U) {
pr_warn("%pOF: missing local APIC ID\n", dn);
continue;
}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 759d392cbe9f..3b58d8703094 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -77,9 +77,6 @@ asmlinkage noinstr void __noreturn doublefault_shim(void)
* some way to reconstruct CR3. We could make a credible guess based
* on cpu_tlbstate, but that would be racy and would not account for
* PTI.
- *
- * Instead, don't bother. We can return through
- * rewind_stack_do_exit() instead.
*/
panic("cannot return from double fault\n");
}
@@ -100,9 +97,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
-#ifndef CONFIG_X86_32_LAZY_GS
- .gs = __KERNEL_STACK_CANARY,
-#endif
+ .gs = 0,
.__cr3 = __pa_nodebug(swapper_pg_dir),
},
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 299c20f0a38b..afae4dd77495 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -69,7 +69,7 @@ static void printk_stack_address(unsigned long address, int reliable,
const char *log_lvl)
{
touch_nmi_watchdog();
- printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
+ printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
@@ -81,12 +81,6 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
/* The user space code from other tasks cannot be accessed. */
if (regs != task_pt_regs(current))
return -EPERM;
- /*
- * Make sure userspace isn't trying to trick us into dumping kernel
- * memory by pointing the userspace instruction pointer at it.
- */
- if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
- return -EINVAL;
/*
* Even if named copy_from_user_nmi() this can be invoked from
@@ -351,7 +345,7 @@ unsigned long oops_begin(void)
}
NOKPROBE_SYMBOL(oops_begin);
-void __noreturn rewind_stack_do_exit(int signr);
+void __noreturn rewind_stack_and_make_dead(int signr);
void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
@@ -386,7 +380,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
* reuse the task stack and that existing poisons are invalid.
*/
kasan_unpoison_task_stack(current);
- rewind_stack_do_exit(signr);
+ rewind_stack_and_make_dead(signr);
}
NOKPROBE_SYMBOL(oops_end);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
+
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 22aad412f965..f267205f2d5a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -31,8 +31,8 @@
* - inform the user about the firmware's notion of memory layout
* via /sys/firmware/memmap
*
- * - the hibernation code uses it to generate a kernel-independent MD5
- * fingerprint of the physical memory layout of a system.
+ * - the hibernation code uses it to generate a kernel-independent CRC32
+ * checksum of the physical memory layout of a system.
*
* - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
* passed to us by the bootloader - the major difference between
@@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions);
#endif
/*
- * Allocate the requested number of bytes with the requsted alignment
+ * Allocate the requested number of bytes with the requested alignment
* and return (the physical address) to the caller. Also register this
* range in the 'kexec' E820 table as a reserved range.
*
@@ -995,8 +995,10 @@ early_param("memmap", parse_memmap_opt);
*/
void __init e820__reserve_setup_data(void)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
+ u32 len;
pa_data = boot_params.hdr.setup_data;
if (!pa_data)
@@ -1004,6 +1006,14 @@ void __init e820__reserve_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
+ if (!data) {
+ pr_warn("e820: failed to memremap setup_data entry\n");
+ return;
+ }
+
+ len = sizeof(*data);
+ pa_next = data->next;
+
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
/*
@@ -1015,18 +1025,27 @@ void __init e820__reserve_setup_data(void)
sizeof(*data) + data->len,
E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
- e820__range_update(((struct setup_indirect *)data->data)->addr,
- ((struct setup_indirect *)data->data)->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- e820__range_update_kexec(((struct setup_indirect *)data->data)->addr,
- ((struct setup_indirect *)data->data)->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ if (data->type == SETUP_INDIRECT) {
+ len += data->len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap(pa_data, len);
+ if (!data) {
+ pr_warn("e820: failed to memremap indirect setup_data\n");
+ return;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ e820__range_update(indirect->addr, indirect->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ e820__range_update_kexec(indirect->addr, indirect->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ }
}
- pa_data = data->next;
- early_memunmap(data, sizeof(*data));
+ pa_data = pa_next;
+ early_memunmap(data, len);
}
e820__update_table(e820_table);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4b5af03dcc1..a6c1867fc7aa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,6 +18,7 @@
#include <linux/bcma/bcma_regs.h>
#include <linux/platform_data/x86/apple.h>
#include <drm/i915_drm.h>
+#include <drm/i915_pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
@@ -515,6 +516,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
.stolen_size = gen9_stolen_size,
};
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_I830_IDS(&i830_early_ops),
INTEL_I845G_IDS(&i845_early_ops),
@@ -549,8 +551,14 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_CNL_IDS(&gen9_early_ops),
INTEL_ICL_11_IDS(&gen11_early_ops),
INTEL_EHL_IDS(&gen11_early_ops),
+ INTEL_JSL_IDS(&gen11_early_ops),
INTEL_TGL_12_IDS(&gen11_early_ops),
INTEL_RKL_IDS(&gen11_early_ops),
+ INTEL_ADLS_IDS(&gen11_early_ops),
+ INTEL_ADLP_IDS(&gen11_early_ops),
+ INTEL_ADLN_IDS(&gen11_early_ops),
+ INTEL_RPLS_IDS(&gen11_early_ops),
+ INTEL_RPLP_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
@@ -588,6 +596,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
u16 device;
int i;
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
@@ -700,7 +715,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ 0, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
@@ -711,12 +726,6 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3e20,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3ec4,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x8a12,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index d3c531d3b244..68b38925a74f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -387,7 +387,7 @@ static int __init setup_early_printk(char *buf)
#endif
#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
if (!strncmp(buf, "xdbc", 4))
- early_xdbc_parse_parameter(buf + 4);
+ early_xdbc_parse_parameter(buf + 4, keep);
#endif
buf++;
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 2954fab15e51..794e70151203 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,7 +2,7 @@
/*
* x86 FPU bug checks:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
/*
* Boot time CPU/FPU FDIV bug detection code:
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
new file mode 100644
index 000000000000..958accf2ccf0
--- /dev/null
+++ b/arch/x86/kernel/fpu/context.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_CONTEXT_H
+#define __X86_KERNEL_FPU_CONTEXT_H
+
+#include <asm/fpu/xstate.h>
+#include <asm/trace/fpu.h>
+
+/* Functions related to FPU context tracking */
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ int cpu = smp_processor_id();
+
+ if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ return;
+
+ if (!fpregs_state_valid(fpu, cpu)) {
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might be
+ * not up to date in current->thread.fpu.xsave state.
+ *
+ * XFD state is handled in restore_fpregs_from_fpstate().
+ */
+ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = cpu;
+ }
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 571220ac8bea..0531d6a06df5 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -6,36 +6,44 @@
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>
+#include <uapi/asm/kvm.h>
+
#include <linux/hardirq.h>
#include <linux/pkeys.h>
+#include <linux/vmalloc.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+DEFINE_PER_CPU(u64, xfd_state);
+#endif
+
+/* The FPU state configuration data for kernel and user space */
+struct fpu_state_config fpu_kernel_cfg __ro_after_init;
+struct fpu_state_config fpu_user_cfg __ro_after_init;
+
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
-union fpregs_state init_fpstate __read_mostly;
+struct fpstate init_fpstate __ro_after_init;
-/*
- * Track whether the kernel is using the FPU state
- * currently.
- *
- * This flag is used:
- *
- * - by IRQ context code to potentially use the FPU
- * if it's unused.
- *
- * - to debug kernel_fpu_begin()/end() correctness
- */
+/* Track in-kernel FPU usage */
static DEFINE_PER_CPU(bool, in_kernel_fpu);
/*
@@ -43,83 +51,374 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-static bool kernel_fpu_disabled(void)
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ */
+bool irq_fpu_usable(void)
{
- return this_cpu_read(in_kernel_fpu);
-}
+ if (WARN_ON_ONCE(in_nmi()))
+ return false;
-static bool interrupted_kernel_fpu_idle(void)
-{
- return !kernel_fpu_disabled();
+ /* In kernel FPU usage already active? */
+ if (this_cpu_read(in_kernel_fpu))
+ return false;
+
+ /*
+ * When not in NMI or hard interrupt context, FPU can be used in:
+ *
+ * - Task context except from within fpregs_lock()'ed critical
+ * regions.
+ *
+ * - Soft interrupt processing context which cannot happen
+ * while in a fpregs_lock()'ed critical region.
+ */
+ if (!in_hardirq())
+ return true;
+
+ /*
+ * In hard interrupt context it's safe when soft interrupts
+ * are enabled, which means the interrupt did not hit in
+ * a fpregs_lock()'ed critical region.
+ */
+ return !softirq_count();
}
+EXPORT_SYMBOL(irq_fpu_usable);
/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
*/
-static bool interrupted_user_mode(void)
+static void update_avx_timestamp(struct fpu *fpu)
{
- struct pt_regs *regs = get_irq_regs();
- return regs && user_mode(regs);
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
}
/*
- * Can we use the FPU in kernel mode with the
- * whole "kernel_fpu_begin/end()" sequence?
+ * Save the FPU register state in fpu->fpstate->regs. The register state is
+ * preserved.
+ *
+ * Must be called with fpregs_lock() held.
+ *
+ * The legacy FNSAVE instruction clears all FPU state unconditionally, so
+ * register state has to be reloaded. That might be a pointless exercise
+ * when the FPU is going to be used by another task right after that. But
+ * this only affects 20+ years old 32bit systems and avoids conditionals all
+ * over the place.
*
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
+ * FXSAVE and all XSAVE variants preserve the FPU register state.
*/
-bool irq_fpu_usable(void)
+void save_fpregs_to_fpstate(struct fpu *fpu)
{
- return !in_interrupt() ||
- interrupted_user_mode() ||
- interrupted_kernel_fpu_idle();
+ if (likely(use_xsave())) {
+ os_xsave(fpu->fpstate);
+ update_avx_timestamp(fpu);
+ return;
+ }
+
+ if (likely(use_fxsr())) {
+ fxsave(&fpu->fpstate->regs.fxsave);
+ return;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to reload them from the memory state.
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
+ frstor(&fpu->fpstate->regs.fsave);
}
-EXPORT_SYMBOL(irq_fpu_usable);
+
+void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
+{
+ /*
+ * AMD K7/K8 and later CPUs up to Zen don't save/restore
+ * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
+ * here by setting it to fixed values. "m" is a random variable
+ * that should be in L1.
+ */
+ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %P[addr]" /* set F?P to defined value */
+ : : [addr] "m" (fpstate));
+ }
+
+ if (use_xsave()) {
+ /*
+ * Dynamically enabled features are enabled in XCR0, but
+ * usage requires also that the corresponding bits in XFD
+ * are cleared. If the bits are set then using a related
+ * instruction will raise #NM. This allows to do the
+ * allocation of the larger FPU buffer lazy from #NM or if
+ * the task has no permission to kill it which would happen
+ * via #UD if the feature is disabled in XCR0.
+ *
+ * XFD state is following the same life time rules as
+ * XSTATE and to restore state correctly XFD has to be
+ * updated before XRSTORS otherwise the component would
+ * stay in or go into init state even if the bits are set
+ * in fpstate::regs::xsave::xfeatures.
+ */
+ xfd_update_state(fpstate);
+
+ /*
+ * Restoring state always needs to modify all features
+ * which are in @mask even if the current task cannot use
+ * extended features.
+ *
+ * So fpstate->xfeatures cannot be used here, because then
+ * a feature for which the task has no permission but was
+ * used by the previous task would not go into init state.
+ */
+ mask = fpu_kernel_cfg.max_features & mask;
+
+ os_xrstor(fpstate, mask);
+ } else {
+ if (use_fxsr())
+ fxrstor(&fpstate->regs.fxsave);
+ else
+ frstor(&fpstate->regs.fsave);
+ }
+}
+
+void fpu_reset_from_exception_fixup(void)
+{
+ restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+
+static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
+{
+ struct fpu_state_perm *fpuperm;
+ u64 perm;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ spin_lock_irq(&current->sighand->siglock);
+ fpuperm = &current->group_leader->thread.fpu.guest_perm;
+ perm = fpuperm->__state_perm;
+
+ /* First fpstate allocation locks down permissions. */
+ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+
+ gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED;
+}
+
+bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate;
+ unsigned int size;
+
+ size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ fpstate = vzalloc(size);
+ if (!fpstate)
+ return false;
+
+ /* Leave xfd to 0 (the reset value defined by spec) */
+ __fpstate_reset(fpstate, 0);
+ fpstate_init_user(fpstate);
+ fpstate->is_valloc = true;
+ fpstate->is_guest = true;
+
+ gfpu->fpstate = fpstate;
+ gfpu->xfeatures = fpu_user_cfg.default_features;
+ gfpu->perm = fpu_user_cfg.default_features;
+
+ /*
+ * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
+ * to userspace, even when XSAVE is unsupported, so that restoring FPU
+ * state on a different CPU that does support XSAVE can cleanly load
+ * the incoming state using its natural XSAVE. In other words, KVM's
+ * uABI size may be larger than this host's default size. Conversely,
+ * the default size should never be larger than KVM's base uABI size;
+ * all features that can expand the uABI size must be opt-in.
+ */
+ gfpu->uabi_size = sizeof(struct kvm_xsave);
+ if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
+ gfpu->uabi_size = fpu_user_cfg.default_size;
+
+ fpu_init_guest_permissions(gfpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+
+void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fps = gfpu->fpstate;
+
+ if (!fps)
+ return;
+
+ if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use))
+ return;
+
+ gfpu->fpstate = NULL;
+ vfree(fps);
+}
+EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact and we can
- * keep registers active.
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
+ * @guest_fpu: Pointer to the guest FPU container
+ * @xfeatures: Features requested by guest CPUID
+ *
+ * Enable all dynamic xfeatures according to guest perm and requested CPUID.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
+{
+ lockdep_assert_preemption_enabled();
+
+ /* Nothing to do if all requested features are already enabled. */
+ xfeatures &= ~guest_fpu->xfeatures;
+ if (!xfeatures)
+ return 0;
+
+ return __xfd_enable_feature(xfeatures, guest_fpu);
+}
+EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+
+#ifdef CONFIG_X86_64
+void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
+{
+ fpregs_lock();
+ guest_fpu->fpstate->xfd = xfd;
+ if (guest_fpu->fpstate->in_use)
+ xfd_update_state(guest_fpu->fpstate);
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+
+/**
+ * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
*
- * The legacy FNSAVE instruction cleared all FPU state
- * unconditionally, so registers are essentially destroyed.
- * Modern FPU state can be kept in registers, if there are
- * no pending FP exceptions.
+ * Must be invoked from KVM after a VMEXIT before enabling interrupts when
+ * XFD write emulation is disabled. This is required because the guest can
+ * freely modify XFD and the state at VMEXIT is not guaranteed to be the
+ * same as the state on VMENTER. So software state has to be udpated before
+ * any operation which depends on it can take place.
+ *
+ * Note: It can be invoked unconditionally even when write emulation is
+ * enabled for the price of a then pointless MSR read.
*/
-int copy_fpregs_to_fpstate(struct fpu *fpu)
+void fpu_sync_guest_vmexit_xfd_state(void)
{
- if (likely(use_xsave())) {
- copy_xregs_to_kernel(&fpu->state.xsave);
+ struct fpstate *fps = current->thread.fpu.fpstate;
+
+ lockdep_assert_irqs_disabled();
+ if (fpu_state_size_dynamic()) {
+ rdmsrl(MSR_IA32_XFD, fps->xfd);
+ __this_cpu_write(xfd_state, fps->xfd);
+ }
+}
+EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+#endif /* CONFIG_X86_64 */
+
+int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
+{
+ struct fpstate *guest_fps = guest_fpu->fpstate;
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *cur_fps = fpu->fpstate;
+
+ fpregs_lock();
+ if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ /* Swap fpstate */
+ if (enter_guest) {
+ fpu->__task_fpstate = cur_fps;
+ fpu->fpstate = guest_fps;
+ guest_fps->in_use = true;
+ } else {
+ guest_fps->in_use = false;
+ fpu->fpstate = fpu->__task_fpstate;
+ fpu->__task_fpstate = NULL;
+ }
+ cur_fps = fpu->fpstate;
+
+ if (!cur_fps->is_confidential) {
+ /* Includes XFD update */
+ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
+ } else {
/*
- * AVX512 state is tracked here because its use is
- * known to slow the max clock speed of the core.
+ * XSTATE is restored by firmware from encrypted
+ * memory. Make sure XFD state is correct while
+ * running with guest fpstate
*/
- if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
- fpu->avx512_timestamp = jiffies;
- return 1;
+ xfd_update_state(cur_fps);
}
- if (likely(use_fxsr())) {
- copy_fxregs_to_kernel(fpu);
- return 1;
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+
+void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u32 pkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ union fpregs_state *ustate = buf;
+ struct membuf mb = { .p = buf, .left = size };
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE);
+ } else {
+ memcpy(&ustate->fxsave, &kstate->regs.fxsave,
+ sizeof(ustate->fxsave));
+ /* Make it restorable on a XSAVE enabled host */
+ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
}
+}
+EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
- /*
- * Legacy FPU register saving, FNSAVE always clears FPU registers,
- * so we have to mark them inactive:
- */
- asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
+ u64 xcr0, u32 *vpkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ const union fpregs_state *ustate = buf;
+ struct pkru_state *xpkru;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
+ return -EINVAL;
+ if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave));
+ return 0;
+ }
+
+ if (ustate->xsave.header.xfeatures & ~xcr0)
+ return -EINVAL;
+
+ ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
+ if (ret)
+ return ret;
+ /* Retrieve PKRU if not in init state */
+ if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
+ xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
+ *vpkru = xpkru->pkru;
+ }
return 0;
}
-EXPORT_SYMBOL(copy_fpregs_to_fpstate);
+EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+#endif /* CONFIG_KVM */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
@@ -133,11 +432,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
if (!(current->flags & PF_KTHREAD) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
- /*
- * Ignore return value -- we don't care if reg state
- * is clobbered.
- */
- copy_fpregs_to_fpstate(&current->thread.fpu);
+ save_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
@@ -160,92 +455,166 @@ void kernel_fpu_end(void)
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
- * Save the FPU state (mark it for reload if necessary):
- *
- * This only ever gets called for the current task.
+ * Sync the FPU register state to current's memory register state when the
+ * current task owns the FPU. The hardware register state is preserved.
*/
-void fpu__save(struct fpu *fpu)
+void fpu_sync_fpstate(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu);
fpregs_lock();
trace_x86_fpu_before_save(fpu);
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- if (!copy_fpregs_to_fpstate(fpu)) {
- copy_kernel_to_fpregs(&fpu->state);
- }
- }
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
trace_x86_fpu_after_save(fpu);
fpregs_unlock();
}
+static inline unsigned int init_fpstate_copy_size(void)
+{
+ if (!use_xsave())
+ return fpu_kernel_cfg.default_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.regs.xsave);
+}
+
+static inline void fpstate_init_fxstate(struct fpstate *fpstate)
+{
+ fpstate->regs.fxsave.cwd = 0x37f;
+ fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
+}
+
/*
* Legacy x87 fpstate state init:
*/
-static inline void fpstate_init_fstate(struct fregs_state *fp)
+static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
+ fpstate->regs.fsave.cwd = 0xffff037fu;
+ fpstate->regs.fsave.swd = 0xffff0000u;
+ fpstate->regs.fsave.twd = 0xffffffffu;
+ fpstate->regs.fsave.fos = 0xffff0000u;
}
-void fpstate_init(union fpregs_state *state)
+/*
+ * Used in two places:
+ * 1) Early boot to setup init_fpstate for non XSAVE systems
+ * 2) fpu_init_fpstate_user() which is invoked from KVM
+ */
+void fpstate_init_user(struct fpstate *fpstate)
{
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpstate_init_soft(&state->soft);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&fpstate->regs.soft);
return;
}
- memset(state, 0, fpu_kernel_xstate_size);
+ xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);
- if (static_cpu_has(X86_FEATURE_XSAVES))
- fpstate_init_xstate(&state->xsave);
- if (static_cpu_has(X86_FEATURE_FXSR))
- fpstate_init_fxstate(&state->fxsave);
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(fpstate);
else
- fpstate_init_fstate(&state->fsave);
+ fpstate_init_fstate(fpstate);
+}
+
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
+{
+ /* Initialize sizes and feature masks */
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->user_size = fpu_user_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->user_xfeatures = fpu_user_cfg.default_features;
+ fpstate->xfd = xfd;
+}
+
+void fpstate_reset(struct fpu *fpu)
+{
+ /* Set the fpstate pointer to the default fpstate */
+ fpu->fpstate = &fpu->__fpstate;
+ __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
+
+ /* Initialize the permission related info in fpu */
+ fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
+ fpu->perm.__state_size = fpu_kernel_cfg.default_size;
+ fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+ /* Same defaults for guests */
+ fpu->guest_perm = fpu->perm;
}
-EXPORT_SYMBOL_GPL(fpstate_init);
-int fpu__copy(struct task_struct *dst, struct task_struct *src)
+static inline void fpu_inherit_perms(struct fpu *dst_fpu)
{
+ if (fpu_state_size_dynamic()) {
+ struct fpu *src_fpu = &current->group_leader->thread.fpu;
+
+ spin_lock_irq(&current->sighand->siglock);
+ /* Fork also inherits the permissions of the parent */
+ dst_fpu->perm = src_fpu->perm;
+ dst_fpu->guest_perm = src_fpu->guest_perm;
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
+
+/* Clone current's FPU state on fork */
+int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
+{
+ struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
- struct fpu *src_fpu = &src->thread.fpu;
+ /* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
- if (!static_cpu_has(X86_FEATURE_FPU))
+ fpstate_reset(dst_fpu);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return 0;
- WARN_ON_FPU(src_fpu != &current->thread.fpu);
+ /*
+ * Enforce reload for user space tasks and prevent kernel threads
+ * from trying to save the FPU registers on context switch.
+ */
+ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
+
+ /*
+ * No FPU state inheritance for kernel threads and IO
+ * worker threads.
+ */
+ if (minimal) {
+ /* Clear out the minimal state */
+ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
+ init_fpstate_copy_size());
+ return 0;
+ }
/*
- * Don't let 'init optimized' areas of the XSAVE area
- * leak into the child task:
+ * If a new feature is added, ensure all dynamic features are
+ * caller-saved from here!
*/
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
/*
- * If the FPU registers are not current just memcpy() the state.
- * Otherwise save current FPU registers directly into the child's FPU
- * context, without any memory-to-memory copying.
+ * Save the default portion of the current FPU state into the
+ * clone. Assume all dynamic features to be defined as caller-
+ * saved, which enables skipping both the expansion of fpstate
+ * and the copying of any dynamic state.
*
- * ( The function 'fails' in the FNSAVE case, which destroys
- * register contents so we have to load them back. )
+ * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
+ * copying is not valid when current uses non-default states.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
-
- else if (!copy_fpregs_to_fpstate(dst_fpu))
- copy_kernel_to_fpregs(&dst_fpu->state);
-
+ fpregs_restore_userregs();
+ save_fpregs_to_fpstate(dst_fpu);
+ if (!(clone_flags & CLONE_THREAD))
+ fpu_inherit_perms(dst_fpu);
fpregs_unlock();
- set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
+ /*
+ * Children never inherit PASID state.
+ * Force it to have its init value:
+ */
+ if (use_xsave())
+ dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -254,60 +623,13 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src)
}
/*
- * Activate the current task's in-memory FPU context,
- * if it has not been used before:
+ * Whitelist the FPU register state embedded into task_struct for hardened
+ * usercopy.
*/
-static void fpu__initialize(struct fpu *fpu)
+void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
{
- WARN_ON_FPU(fpu != &current->thread.fpu);
-
- set_thread_flag(TIF_NEED_FPU_LOAD);
- fpstate_init(&fpu->state);
- trace_x86_fpu_init_state(fpu);
-}
-
-/*
- * This function must be called before we read a task's fpstate.
- *
- * There's two cases where this gets called:
- *
- * - for the current task (when coredumping), in which case we have
- * to save the latest FPU registers into the fpstate,
- *
- * - or it's called for stopped tasks (ptrace), in which case the
- * registers were already saved by the context-switch code when
- * the task scheduled out.
- *
- * If the task has used the FPU before then save it.
- */
-void fpu__prepare_read(struct fpu *fpu)
-{
- if (fpu == &current->thread.fpu)
- fpu__save(fpu);
-}
-
-/*
- * This function must be called before we write a task's fpstate.
- *
- * Invalidate any cached FPU registers.
- *
- * After this function call, after registers in the fpstate are
- * modified and the child task has woken up, the child task will
- * restore the modified FPU state from the modified context. If we
- * didn't clear its cached status here then the cached in-registers
- * state pending on its former CPU could be restored, corrupting
- * the modifications.
- */
-void fpu__prepare_write(struct fpu *fpu)
-{
- /*
- * Only stopped child tasks can be used to modify the FPU
- * state in the fpstate buffer:
- */
- WARN_ON_FPU(fpu == &current->thread.fpu);
-
- /* Invalidate any cached state: */
- __fpu_invalidate_fpregs_state(fpu);
+ *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
+ *size = fpu_kernel_cfg.default_size;
}
/*
@@ -340,61 +662,88 @@ void fpu__drop(struct fpu *fpu)
* Clear FPU registers by setting them up from the init fpstate.
* Caller must do fpregs_[un]lock() around it.
*/
-static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
+static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
if (use_xsave())
- copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
- else if (static_cpu_has(X86_FEATURE_FXSR))
- copy_kernel_to_fxregs(&init_fpstate.fxsave);
+ os_xrstor(&init_fpstate, features_mask);
+ else if (use_fxsr())
+ fxrstor(&init_fpstate.regs.fxsave);
else
- copy_kernel_to_fregs(&init_fpstate.fsave);
+ frstor(&init_fpstate.regs.fsave);
- if (boot_cpu_has(X86_FEATURE_OSPKE))
- copy_init_pkru_to_fpregs();
+ pkru_write_default();
}
/*
- * Clear the FPU state back to init state.
- *
- * Called by sys_execve(), by the signal handler code and by various
- * error paths.
+ * Reset current->fpu memory state to the init values.
*/
-static void fpu__clear(struct fpu *fpu, bool user_only)
+static void fpu_reset_fpregs(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ fpregs_lock();
+ fpu__drop(fpu);
+ /*
+ * This does not change the actual hardware registers. It just
+ * resets the memory image and sets TIF_NEED_FPU_LOAD so a
+ * subsequent return to usermode will reload the registers from the
+ * task's memory image.
+ *
+ * Do not use fpstate_init() here. Just copy init_fpstate which has
+ * the correct content already except for PKRU.
+ *
+ * PKRU handling does not rely on the xstate when restoring for
+ * user space as PKRU is eagerly written in switch_to() and
+ * flush_thread().
+ */
+ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ fpregs_unlock();
+}
+
+/*
+ * Reset current's user FPU states to the init states. current's
+ * supervisor states, if any, are not modified by this function. The
+ * caller guarantees that the XSTATE header in memory is intact.
+ */
+void fpu__clear_user_states(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu);
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpu__drop(fpu);
- fpu__initialize(fpu);
+ fpregs_lock();
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpu_reset_fpregs();
+ fpregs_unlock();
return;
}
- fpregs_lock();
+ /*
+ * Ensure that current's supervisor states are loaded into their
+ * corresponding registers.
+ */
+ if (xfeatures_mask_supervisor() &&
+ !fpregs_state_valid(fpu, smp_processor_id()))
+ os_xrstor_supervisor(fpu->fpstate);
- if (user_only) {
- if (!fpregs_state_valid(fpu, smp_processor_id()) &&
- xfeatures_mask_supervisor())
- copy_kernel_to_xregs(&fpu->state.xsave,
- xfeatures_mask_supervisor());
- copy_init_fpstate_to_fpregs(xfeatures_mask_user());
- } else {
- copy_init_fpstate_to_fpregs(xfeatures_mask_all);
- }
+ /* Reset user states in registers. */
+ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
+ /*
+ * Now all FPU registers have their desired values. Inform the FPU
+ * state machine that current's FPU registers are in the hardware
+ * registers. The memory image does not need to be updated because
+ * any operation relying on it has to save the registers first when
+ * current's FPU is marked active.
+ */
fpregs_mark_activate();
fpregs_unlock();
}
-void fpu__clear_user_states(struct fpu *fpu)
+void fpu_flush_thread(void)
{
- fpu__clear(fpu, true);
+ fpstate_reset(&current->thread.fpu);
+ fpu_reset_fpregs();
}
-
-void fpu__clear_all(struct fpu *fpu)
-{
- fpu__clear(fpu, false);
-}
-
/*
* Load FPU context before returning to userspace.
*/
@@ -403,7 +752,7 @@ void switch_fpu_return(void)
if (!static_cpu_has(X86_FEATURE_FPU))
return;
- __fpregs_load_activate();
+ fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);
@@ -433,7 +782,6 @@ void fpregs_mark_activate(void)
fpu->last_cpu = smp_processor_id();
clear_thread_flag(TIF_NEED_FPU_LOAD);
}
-EXPORT_SYMBOL_GPL(fpregs_mark_activate);
/*
* x87 math exception handling:
@@ -456,11 +804,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
- cwd = fpu->state.fxsave.cwd;
- swd = fpu->state.fxsave.swd;
+ cwd = fpu->fpstate->regs.fxsave.cwd;
+ swd = fpu->fpstate->regs.fxsave.swd;
} else {
- cwd = (unsigned short)fpu->state.fsave.cwd;
- swd = (unsigned short)fpu->state.fsave.swd;
+ cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
+ swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
}
err = swd & ~cwd;
@@ -474,7 +822,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
unsigned short mxcsr = MXCSR_DEFAULT;
if (boot_cpu_has(X86_FEATURE_XMM))
- mxcsr = fpu->state.fxsave.mxcsr;
+ mxcsr = fpu->fpstate->regs.fxsave.mxcsr;
err = ~(mxcsr >> 7) & mxcsr;
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 701f196d7c68..621f4b6cac4a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -2,7 +2,7 @@
/*
* x86 FPU boot time init code:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -10,6 +10,10 @@
#include <linux/sched/task.h>
#include <linux/init.h>
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
@@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void)
/* Flush out any pending x87 state: */
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU))
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
else
#endif
asm volatile ("fninit");
@@ -89,7 +93,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
/*
* Boot time FPU feature detection code:
*/
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu;
EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
@@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void)
static void __init fpu__init_system_generic(void)
{
/*
- * Set up the legacy init FPU context. (xstate init might overwrite this
- * with a more modern format, if the CPU supports it.)
+ * Set up the legacy init FPU context. Will be updated when the
+ * CPU supports XSAVE[S].
*/
- fpstate_init(&init_fpstate);
+ fpstate_init_user(&init_fpstate);
fpu__init_system_mxcsr();
}
-/*
- * Size of the FPU context state. All tasks in the system use the
- * same context size, regardless of what portion they use.
- * This is inherent to the XSAVE architecture which puts all state
- * components into a single, continuous memory block:
- */
-unsigned int fpu_kernel_xstate_size;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
-
/* Get alignment of the TYPE. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void)
* Subtract off the static size of the register state.
* It potentially has a bunch of padding.
*/
- task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+ task_size -= sizeof(current->thread.fpu.__fpstate.regs);
/*
* Add back the dynamically-calculated register state
* size.
*/
- task_size += fpu_kernel_xstate_size;
+ task_size += fpu_kernel_cfg.default_size;
/*
* We dynamically size 'struct fpu', so we require that
@@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void)
* you hit a compile error here, check the structure to
* see if something got added to the end.
*/
- CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
@@ -192,48 +187,34 @@ static void __init fpu__init_task_struct_size(void)
*/
static void __init fpu__init_system_xstate_size_legacy(void)
{
- static int on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ unsigned int size;
/*
- * Note that xstate sizes might be overwritten later during
- * fpu__init_system_xstate().
+ * Note that the size configuration might be overwritten later
+ * during fpu__init_system_xstate().
*/
-
- if (!boot_cpu_has(X86_FEATURE_FPU)) {
- fpu_kernel_xstate_size = sizeof(struct swregs_state);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ size = sizeof(struct swregs_state);
+ } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ size = sizeof(struct fxregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE;
} else {
- if (boot_cpu_has(X86_FEATURE_FXSR))
- fpu_kernel_xstate_size =
- sizeof(struct fxregs_state);
- else
- fpu_kernel_xstate_size =
- sizeof(struct fregs_state);
+ size = sizeof(struct fregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FP;
}
- fpu_user_xstate_size = fpu_kernel_xstate_size;
+ fpu_kernel_cfg.max_size = size;
+ fpu_kernel_cfg.default_size = size;
+ fpu_user_cfg.max_size = size;
+ fpu_user_cfg.default_size = size;
+ fpstate_reset(&current->thread.fpu);
}
-/*
- * Find supported xfeatures based on cpu features and command-line input.
- * This must be called after fpu__init_parse_early_param() is called and
- * xfeatures_mask is enumerated.
- */
-u64 __init fpu__get_supported_xfeatures_mask(void)
+static void __init fpu__init_init_fpstate(void)
{
- return XFEATURE_MASK_USER_SUPPORTED |
- XFEATURE_MASK_SUPERVISOR_SUPPORTED;
-}
-
-/* Legacy code to initialize eager fpu mode. */
-static void __init fpu__init_system_ctx_switch(void)
-{
- static bool on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ /* Bring init_fpstate size and features up to date */
+ init_fpstate.size = fpu_kernel_cfg.max_size;
+ init_fpstate.xfeatures = fpu_kernel_cfg.max_features;
}
/*
@@ -242,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void)
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpstate_reset(&current->thread.fpu);
fpu__init_system_early_generic(c);
/*
@@ -252,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
- fpu__init_system_xstate();
+ fpu__init_system_xstate(fpu_kernel_cfg.max_size);
fpu__init_task_struct_size();
-
- fpu__init_system_ctx_switch();
+ fpu__init_init_fpstate();
}
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
new file mode 100644
index 000000000000..dbdb31f55fc7
--- /dev/null
+++ b/arch/x86/kernel/fpu/internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_INTERNAL_H
+#define __X86_KERNEL_FPU_INTERNAL_H
+
+extern struct fpstate init_fpstate;
+
+/* CPU feature check wrappers */
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_FXSR);
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/* Used in init.c */
+extern void fpstate_init_user(struct fpstate *fpstate);
+extern void fpstate_reset(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
new file mode 100644
index 000000000000..098f367bb8a7
--- /dev/null
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_LEGACY_H
+#define __X86_KERNEL_FPU_LEGACY_H
+
+#include <asm/fpu/types.h>
+
+extern unsigned int mxcsr_feature_mask;
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+/*
+ * Returns 0 on success or the trap number when the operation raises an
+ * exception.
+ */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn_err(insn, output, input...) \
+({ \
+ int err; \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \
+ : output : input)
+
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+}
+
+static inline void fxrstor(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_safe(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void frstor(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_safe(struct fregs_state *fx)
+{
+ return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..75ffaef8c299 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -2,11 +2,17 @@
/*
* FPU register's regset abstraction, for ptrace, core dumps, etc.
*/
-#include <asm/fpu/internal.h>
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
+#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
-#include <linux/sched/task_stack.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
@@ -26,18 +32,58 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r
return 0;
}
+/*
+ * The regset get() functions are invoked from:
+ *
+ * - coredump to dump the current task's fpstate. If the current task
+ * owns the FPU then the memory state has to be synchronized and the
+ * FPU register state preserved. Otherwise fpstate is already in sync.
+ *
+ * - ptrace to dump fpstate of a stopped task, in which case the registers
+ * have already been saved to fpstate on context switch.
+ */
+static void sync_fpstate(struct fpu *fpu)
+{
+ if (fpu == &current->thread.fpu)
+ fpu_sync_fpstate(fpu);
+}
+
+/*
+ * Invalidate cached FPU registers before modifying the stopped target
+ * task's fpstate.
+ *
+ * This forces the target task on resume to restore the FPU registers from
+ * modified fpstate. Otherwise the task might skip the restore and operate
+ * with the cached FPU registers which discards the modifications.
+ */
+static void fpu_force_restore(struct fpu *fpu)
+{
+ /*
+ * Only stopped child tasks can be used to modify the FPU
+ * state in the fpstate buffer:
+ */
+ WARN_ON_FPU(fpu == &current->thread.fpu);
+
+ __fpu_invalidate_fpregs_state(fpu);
+}
+
int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
struct fpu *fpu = &target->thread.fpu;
- if (!boot_cpu_has(X86_FEATURE_FXSR))
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__prepare_read(fpu);
- fpstate_sanitize_xstate(fpu);
+ sync_fpstate(fpu);
+
+ if (!use_xsave()) {
+ return membuf_write(&to, &fpu->fpstate->regs.fxsave,
+ sizeof(fpu->fpstate->regs.fxsave));
+ }
- return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state));
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
+ return 0;
}
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -45,62 +91,51 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
const void *kbuf, const void __user *ubuf)
{
struct fpu *fpu = &target->thread.fpu;
+ struct fxregs_state newstate;
int ret;
- if (!boot_cpu_has(X86_FEATURE_FXSR))
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__prepare_write(fpu);
- fpstate_sanitize_xstate(fpu);
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(newstate))
+ return -EINVAL;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fxsave, 0, -1);
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+ if (ret)
+ return ret;
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+ /* Do not allow an invalid MXCSR value. */
+ if (newstate.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
- /*
- * update the header bits in the xsave header, indicating the
- * presence of FP and SSE state.
- */
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpu_force_restore(fpu);
- return ret;
+ /* Copy the state */
+ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));
+
+ /* Clear xmm8..15 for 32-bit callers */
+ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
+ if (in_ia32_syscall())
+ memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16);
+
+ /* Mark FP and SSE as in use when XSAVE is enabled */
+ if (use_xsave())
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+
+ return 0;
}
int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
- struct fpu *fpu = &target->thread.fpu;
- struct xregs_state *xsave;
-
- if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
return -ENODEV;
- xsave = &fpu->state.xsave;
-
- fpu__prepare_read(fpu);
+ sync_fpstate(&target->thread.fpu);
- if (using_compacted_format()) {
- copy_xstate_to_kernel(to, xsave);
- return 0;
- } else {
- fpstate_sanitize_xstate(fpu);
- /*
- * Copy the 48 bytes defined by the software into the xsave
- * area in the thread struct, so that we can copy the whole
- * area to user using one user_regset_copyout().
- */
- memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
- /*
- * Copy the xstate memory layout.
- */
- return membuf_write(&to, xsave, fpu_user_xstate_size);
- }
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
+ return 0;
}
int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -108,44 +143,34 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
const void *kbuf, const void __user *ubuf)
{
struct fpu *fpu = &target->thread.fpu;
- struct xregs_state *xsave;
+ struct xregs_state *tmpbuf = NULL;
int ret;
- if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
return -ENODEV;
/*
* A whole standard-format XSAVE buffer is needed:
*/
- if ((pos != 0) || (count < fpu_user_xstate_size))
+ if (pos != 0 || count != fpu_user_cfg.max_size)
return -EFAULT;
- xsave = &fpu->state.xsave;
-
- fpu__prepare_write(fpu);
+ if (!kbuf) {
+ tmpbuf = vmalloc(count);
+ if (!tmpbuf)
+ return -ENOMEM;
- if (using_compacted_format()) {
- if (kbuf)
- ret = copy_kernel_to_xstate(xsave, kbuf);
- else
- ret = copy_user_to_xstate(xsave, ubuf);
- } else {
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
- if (!ret)
- ret = validate_user_xstate_header(&xsave->header);
+ if (copy_from_user(tmpbuf, ubuf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- xsave->i387.mxcsr &= mxcsr_feature_mask;
-
- /*
- * In case of failure, mark all states as init:
- */
- if (ret)
- fpstate_init(&fpu->state);
+ fpu_force_restore(fpu);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
+out:
+ vfree(tmpbuf);
return ret;
}
@@ -221,10 +246,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
* FXSR floating point environment conversions.
*/
-void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk,
+ struct fxregs_state *fxsave)
{
- struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -258,6 +283,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
memcpy(&to[i], &from[i], sizeof(to[0]));
}
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+ __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave);
+}
+
void convert_to_fxsr(struct fxregs_state *fxsave,
const struct user_i387_ia32_struct *env)
@@ -290,25 +321,29 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
{
struct fpu *fpu = &target->thread.fpu;
struct user_i387_ia32_struct env;
+ struct fxregs_state fxsave, *fx;
- fpu__prepare_read(fpu);
+ sync_fpstate(fpu);
- if (!boot_cpu_has(X86_FEATURE_FPU))
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return fpregs_soft_get(target, regset, to);
- if (!boot_cpu_has(X86_FEATURE_FXSR)) {
- return membuf_write(&to, &fpu->state.fsave,
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ return membuf_write(&to, &fpu->fpstate->regs.fsave,
sizeof(struct fregs_state));
}
- fpstate_sanitize_xstate(fpu);
+ if (use_xsave()) {
+ struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) };
- if (to.left == sizeof(env)) {
- convert_from_fxsr(to.p, target);
- return 0;
+ /* Handle init state optimized xstate correctly */
+ copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
+ fx = &fxsave;
+ } else {
+ fx = &fpu->fpstate->regs.fxsave;
}
- convert_from_fxsr(&env, target);
+ __convert_from_fxsr(&env, target, fx);
return membuf_write(&to, &env, sizeof(env));
}
@@ -320,31 +355,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;
- fpu__prepare_write(fpu);
- fpstate_sanitize_xstate(fpu);
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(struct user_i387_ia32_struct))
+ return -EINVAL;
- if (!boot_cpu_has(X86_FEATURE_FPU))
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
- if (!boot_cpu_has(X86_FEATURE_FXSR))
- return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fsave, 0,
- -1);
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (ret)
+ return ret;
- if (pos > 0 || count < sizeof(env))
- convert_from_fxsr(&env, target);
+ fpu_force_restore(fpu);
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
- if (!ret)
- convert_to_fxsr(&target->thread.fpu.state.fxsave, &env);
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env);
+ else
+ memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env));
/*
- * update the header bit in the xsave header, indicating the
+ * Update the header bit in the xsave header, indicating the
* presence of FP.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
- return ret;
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+
+ return 0;
}
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -7,37 +7,40 @@
#include <linux/cpu.h>
#include <linux/pagemap.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
#include <asm/sigframe.h>
+#include <asm/trapnr.h>
#include <asm/trace/fpu.h>
-static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-static inline int check_for_xstate(struct fxregs_state __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw)
+static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
+ void __user *fpstate = fxbuf;
unsigned int magic2;
- if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
- return -1;
+ if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
+ return false;
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > fpu_user_xstate_size ||
+ fx_sw->xstate_size > current->thread.fpu.fpstate->user_size ||
fx_sw->xstate_size > fx_sw->extended_size)
- return -1;
+ goto setfx;
/*
* Check for the presence of second magic word at the end of memory
@@ -45,26 +48,34 @@ static inline int check_for_xstate(struct fxregs_state __user *buf,
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
- || magic2 != FP_XSTATE_MAGIC2)
- return -1;
-
- return 0;
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
+ return false;
+
+ if (likely(magic2 == FP_XSTATE_MAGIC2))
+ return true;
+setfx:
+ trace_x86_fpu_xstate_check_failed(&current->thread.fpu);
+
+ /* Set the parameters for fx only state */
+ fx_sw->magic1 = 0;
+ fx_sw->xstate_size = sizeof(struct fxregs_state);
+ fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
+ return true;
}
/*
* Signal frame handlers.
*/
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- copy_fxregs_to_kernel(&tsk->thread.fpu);
+ fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
@@ -72,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
if (__copy_to_user(buf, &env, sizeof(env)) ||
__put_user(xsave->i387.swd, &fp->status) ||
__put_user(X86_FXSR_MAGIC, &fp->magic))
- return -1;
+ return false;
} else {
struct fregs_state __user *fp = buf;
u32 swd;
+
if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
- return -1;
+ return false;
}
- return 0;
+ return true;
}
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This is saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
+ struct fpstate *fpstate)
+{
+ sw_bytes->magic1 = FP_XSTATE_MAGIC1;
+ sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;
+ sw_bytes->xfeatures = fpstate->user_xfeatures;
+ sw_bytes->xstate_size = fpstate->user_size;
+
+ if (ia32_frame)
+ sw_bytes->extended_size += sizeof(struct fregs_state);
+}
+
+static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
+ struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes *sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
- sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
- err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+ save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
+ err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes));
if (!use_xsave())
- return err;
+ return !err;
err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *)(buf + fpu_user_xstate_size));
+ (__u32 __user *)(buf + fpstate->user_size));
/*
* Read the xfeatures which we copied (directly from the cpu or
@@ -121,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
- return err;
+ return !err;
}
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
- int err;
-
if (use_xsave())
- err = copy_xregs_to_user(buf);
- else if (use_fxsr())
- err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+ return xsave_to_user_sigframe(buf);
+ if (use_fxsr())
+ return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
- err = copy_fregs_to_user((struct fregs_state __user *) buf);
-
- if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
- err = -EFAULT;
- return err;
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf);
}
/*
@@ -150,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* buf == buf_fx for 64-bit frames and 32-bit fsave frame.
* buf != buf_fx for 32-bit frames with fxstate.
*
- * Try to save it directly to the user frame with disabled page fault handler.
- * If this fails then do the slow path where the FPU state is first saved to
- * task's fpu->state and then copy it to the user frame pointed to by the
- * aligned pointer 'buf_fx'.
+ * Save it directly to the user frame with disabled page fault handler. If
+ * that faults, try to clear the frame which handles the page fault.
*
* If this is a 32-bit frame with fxstate, put a fsave header before
* the aligned state at 'buf_fx'.
@@ -161,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
struct task_struct *tsk = current;
- int ia32_fxstate = (buf != buf_fx);
+ struct fpstate *fpstate = tsk->thread.fpu.fpstate;
+ bool ia32_fxstate = (buf != buf_fx);
int ret;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
@@ -172,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
if (!static_cpu_has(X86_FEATURE_FPU)) {
struct user_i387_ia32_struct fp;
+
fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
.left = sizeof(fp)});
- return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0;
+ return !copy_to_user(buf, &fp, sizeof(fp));
}
if (!access_ok(buf, size))
- return -EACCES;
+ return false;
+
+ if (use_xsave()) {
+ struct xregs_state __user *xbuf = buf_fx;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
+ return false;
+ }
retry:
/*
* Load the FPU registers if they are not valid for the current task.
@@ -188,7 +225,7 @@ retry:
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- __fpregs_load_activate();
+ fpregs_restore_userregs();
pagefault_disable();
ret = copy_fpregs_to_sigframe(buf_fx);
@@ -196,191 +233,143 @@ retry:
fpregs_unlock();
if (ret) {
- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
+ if (!__clear_user(buf_fx, fpstate->user_size))
goto retry;
- return -EFAULT;
+ return false;
}
/* Save the fsave header for the 32-bit frames. */
- if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
- return -1;
+ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
+ return false;
- if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
- return -1;
+ if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
+ return false;
- return 0;
+ return true;
}
-static inline void
-sanitize_restored_user_xstate(union fpregs_state *state,
- struct user_i387_ia32_struct *ia32_env,
- u64 user_xfeatures, int fx_only)
+static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
+ u64 xrestore, bool fx_only)
{
- struct xregs_state *xsave = &state->xsave;
- struct xstate_header *header = &xsave->header;
-
if (use_xsave()) {
- /*
- * Note: we don't need to zero the reserved bits in the
- * xstate_header here because we either didn't copy them at all,
- * or we checked earlier that they aren't set.
- */
+ u64 init_bv = ufeatures & ~xrestore;
+ int ret;
- /*
- * 'user_xfeatures' might have bits clear which are
- * set in header->xfeatures. This represents features that
- * were in init state prior to a signal delivery, and need
- * to be reset back to the init state. Clear any user
- * feature bits which are set in the kernel buffer to get
- * them back to the init state.
- *
- * Supervisor state is unchanged by input from userspace.
- * Ensure supervisor state bits stay set and supervisor
- * state is not modified.
- */
- if (fx_only)
- header->xfeatures = XFEATURE_MASK_FPSSE;
+ if (likely(!fx_only))
+ ret = xrstor_from_user_sigframe(buf, xrestore);
else
- header->xfeatures &= user_xfeatures |
- xfeatures_mask_supervisor();
- }
-
- if (use_fxsr()) {
- /*
- * mscsr reserved bits must be masked to zero for security
- * reasons.
- */
- xsave->i387.mxcsr &= mxcsr_feature_mask;
+ ret = fxrstor_from_user_sigframe(buf);
- if (ia32_env)
- convert_to_fxsr(&state->fxsave, ia32_env);
+ if (!ret && unlikely(init_bv))
+ os_xrstor(&init_fpstate, init_bv);
+ return ret;
+ } else if (use_fxsr()) {
+ return fxrstor_from_user_sigframe(buf);
+ } else {
+ return frstor_from_user_sigframe(buf);
}
}
/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ * Attempt to restore the FPU registers directly from user memory.
+ * Pagefaults are handled and any errors returned are fatal.
*/
-static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
+ bool fx_only, unsigned int size)
{
- u64 init_bv;
- int r;
+ struct fpu *fpu = &current->thread.fpu;
+ int ret;
- if (use_xsave()) {
- if (fx_only) {
- init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
+retry:
+ fpregs_lock();
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpu->fpstate);
+ pagefault_disable();
+ ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
+ xrestore, fx_only);
+ pagefault_enable();
- r = copy_user_to_fxregs(buf);
- if (!r)
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- return r;
- } else {
- init_bv = xfeatures_mask_user() & ~xbv;
+ if (unlikely(ret)) {
+ /*
+ * The above did an FPU restore operation, restricted to
+ * the user portion of the registers, and failed, but the
+ * microcode might have modified the FPU registers
+ * nevertheless.
+ *
+ * If the FPU registers do not belong to current, then
+ * invalidate the FPU register state otherwise the task
+ * might preempt current and return to user space with
+ * corrupted FPU registers.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
- r = copy_user_to_xregs(buf, xbv);
- if (!r && unlikely(init_bv))
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- return r;
- }
- } else if (use_fxsr()) {
- return copy_user_to_fxregs(buf);
- } else
- return copy_user_to_fregs(buf);
+ /* Try to handle #PF, but anything else is fatal. */
+ if (ret != X86_TRAP_PF)
+ return false;
+
+ if (!fault_in_readable(buf, size))
+ goto retry;
+ return false;
+ }
+
+ /*
+ * Restore supervisor states: previous context switch etc has done
+ * XSAVES and saved the supervisor states in the kernel buffer from
+ * which they can be restored now.
+ *
+ * It would be optimal to handle this with a single XRSTORS, but
+ * this does not work because the rest of the FPU registers have
+ * been restored from a user buffer directly.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
+ os_xrstor_supervisor(fpu->fpstate);
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return true;
}
-static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
{
- struct user_i387_ia32_struct *envp = NULL;
- int state_size = fpu_kernel_xstate_size;
- int ia32_fxstate = (buf != buf_fx);
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
+ bool success, fx_only = false;
+ union fpregs_state *fpregs;
+ unsigned int state_size;
u64 user_xfeatures = 0;
- int fx_only = 0;
- int ret = 0;
-
- ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
- IS_ENABLED(CONFIG_IA32_EMULATION));
-
- if (!buf) {
- fpu__clear_user_states(fpu);
- return 0;
- }
-
- if (!access_ok(buf, size))
- return -EACCES;
-
- if (!static_cpu_has(X86_FEATURE_FPU))
- return fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, buf) != 0;
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
- if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
- /*
- * Couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- state_size = sizeof(struct fxregs_state);
- fx_only = 1;
- trace_x86_fpu_xstate_check_failed(fpu);
- } else {
- state_size = fx_sw_user.xstate_size;
- user_xfeatures = fx_sw_user.xfeatures;
- }
- }
- if ((unsigned long)buf_fx % 64)
- fx_only = 1;
+ if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
+ return false;
- if (!ia32_fxstate) {
- /*
- * Attempt to restore the FPU registers directly from user
- * memory. For that to succeed, the user access cannot cause
- * page faults. If it does, fall back to the slow path below,
- * going through the kernel buffer with the enabled pagefault
- * handler.
- */
- fpregs_lock();
- pagefault_disable();
- ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only);
- pagefault_enable();
- if (!ret) {
-
- /*
- * Restore supervisor states: previous context switch
- * etc has done XSAVES and saved the supervisor states
- * in the kernel buffer from which they can be restored
- * now.
- *
- * We cannot do a single XRSTORS here - which would
- * be nice - because the rest of the FPU registers are
- * being restored from a user buffer directly. The
- * single XRSTORS happens below, when the user buffer
- * has been copied to the kernel one.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD) &&
- xfeatures_mask_supervisor())
- copy_kernel_to_xregs(&fpu->state.xsave,
- xfeatures_mask_supervisor());
- fpregs_mark_activate();
- fpregs_unlock();
- return 0;
- }
- fpregs_unlock();
+ fx_only = !fx_sw_user.magic1;
+ state_size = fx_sw_user.xstate_size;
+ user_xfeatures = fx_sw_user.xfeatures;
} else {
- /*
- * For 32-bit frames with fxstate, copy the fxstate so it can
- * be reconstructed later.
- */
- ret = __copy_from_user(&env, buf, sizeof(env));
- if (ret)
- goto err_out;
- envp = &env;
+ user_xfeatures = XFEATURE_MASK_FPSSE;
+ state_size = fpu->fpstate->user_size;
}
+ if (likely(!ia32_fxstate)) {
+ /* Restore the FPU registers directly from user memory. */
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
+ state_size);
+ }
+
+ /*
+ * Copy the legacy state because the FP portion of the FX frame has
+ * to be ignored for histerical raisins. The legacy state is folded
+ * in once the larger state has been copied.
+ */
+ if (__copy_from_user(&env, buf, sizeof(env)))
+ return false;
+
/*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
* not modified on context switch and that the xstate is considered
@@ -388,114 +377,136 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
* the optimisation).
*/
fpregs_lock();
-
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
-
/*
- * Supervisor states are not modified by user space input. Save
- * current supervisor states first and invalidate the FPU regs.
+ * If supervisor states are available then save the
+ * hardware state in current's fpstate so that the
+ * supervisor state is preserved. Save the full state for
+ * simplicity. There is no point in optimizing this by only
+ * saving the supervisor states and then shuffle them to
+ * the right place in memory. It's ia32 mode. Shrug.
*/
if (xfeatures_mask_supervisor())
- copy_supervisor_to_kernel(&fpu->state.xsave);
+ os_xsave(fpu->fpstate);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
+ __cpu_invalidate_fpregs_state();
fpregs_unlock();
+ fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
-
- if (using_compacted_format()) {
- ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+ if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ return false;
+ } else {
+ if (__copy_from_user(&fpregs->fxsave, buf_fx,
+ sizeof(fpregs->fxsave)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
} else {
- ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
-
- if (!ret && state_size > offsetof(struct xregs_state, header))
- ret = validate_user_xstate_header(&fpu->state.xsave.header);
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
}
- if (ret)
- goto err_out;
- sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
- fx_only);
+ /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
+ if (use_xsave())
+ fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ }
- fpregs_lock();
- if (unlikely(init_bv))
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+ /* Fold the legacy FP storage */
+ convert_to_fxsr(&fpregs->fxsave, &env);
+ fpregs_lock();
+ if (use_xsave()) {
/*
- * Restore previously saved supervisor xstates along with
- * copied-in user xstates.
+ * Remove all UABI feature bits not set in user_xfeatures
+ * from the memory xstate header which makes the full
+ * restore below bring them into init state. This works for
+ * fx_only mode as well because that has only FP and SSE
+ * set in user_xfeatures.
+ *
+ * Preserve supervisor states!
*/
- ret = copy_kernel_to_xregs_err(&fpu->state.xsave,
- user_xfeatures | xfeatures_mask_supervisor());
-
- } else if (use_fxsr()) {
- ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
- if (ret) {
- ret = -EFAULT;
- goto err_out;
- }
-
- sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
- fx_only);
-
- fpregs_lock();
- if (use_xsave()) {
- u64 init_bv;
-
- init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- }
+ u64 mask = user_xfeatures | xfeatures_mask_supervisor();
- ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave);
+ fpregs->xsave.header.xfeatures &= mask;
+ success = !os_xrstor_safe(fpu->fpstate,
+ fpu_kernel_cfg.max_features);
} else {
- ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size);
- if (ret)
- goto err_out;
-
- fpregs_lock();
- ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
+ success = !fxrstor_safe(&fpregs->fxsave);
}
- if (!ret)
+
+ if (likely(success))
fpregs_mark_activate();
- else
- fpregs_deactivate(fpu);
- fpregs_unlock();
-err_out:
- if (ret)
- fpu__clear_user_states(fpu);
- return ret;
+ fpregs_unlock();
+ return success;
}
-static inline int xstate_sigframe_size(void)
+static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
{
- return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
- fpu_user_xstate_size;
+ unsigned int size = fpstate->user_size;
+
+ return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size;
}
/*
* Restore FPU state from a sigframe:
*/
-int fpu__restore_sig(void __user *buf, int ia32_frame)
+bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
+ struct fpu *fpu = &current->thread.fpu;
void __user *buf_fx = buf;
- int size = xstate_sigframe_size();
+ bool ia32_fxstate = false;
+ bool success = false;
+ unsigned int size;
+ if (unlikely(!buf)) {
+ fpu__clear_user_states(fpu);
+ return true;
+ }
+
+ size = xstate_sigframe_size(fpu->fpstate);
+
+ ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ /*
+ * Only FXSR enabled systems need the FX state quirk.
+ * FRSTOR does not need it and can use the fast path.
+ */
if (ia32_frame && use_fxsr()) {
buf_fx = buf + sizeof(struct fregs_state);
size += sizeof(struct fregs_state);
+ ia32_fxstate = true;
+ }
+
+ if (!access_ok(buf, size))
+ goto out;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
+ success = !fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
+ } else {
+ success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
}
- return __fpu__restore_sig(buf, buf_fx, size);
+out:
+ if (unlikely(!success))
+ fpu__clear_user_states(fpu);
+ return success;
}
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
{
- unsigned long frame_size = xstate_sigframe_size();
+ unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate);
*buf_fx = sp = round_down(sp - frame_size, 64);
if (ia32_frame && use_fxsr()) {
@@ -507,28 +518,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
return sp;
}
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-void fpu__init_prepare_fx_sw_frame(void)
+
+unsigned long __init fpu__get_fpstate_size(void)
{
- int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
+ unsigned long ret = fpu_user_cfg.max_size;
- fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask_user();
- fx_sw_reserved.xstate_size = fpu_user_xstate_size;
+ if (use_xsave())
+ ret += FP_XSTATE_MAGIC2_SIZE;
- if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
- IS_ENABLED(CONFIG_X86_32)) {
- int fsave_header_size = sizeof(struct fregs_state);
+ /*
+ * This space is needed on (most) 32-bit kernels, or when a 32-bit
+ * app is running on a 64-bit kernel. To keep things simple, just
+ * assume the worst case and always include space for 'freg_state',
+ * even for 64-bit apps on 64-bit kernels. This wastes a bit of
+ * space, but keeps the code simple.
+ */
+ if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+ ret += sizeof(struct fregs_state);
- fx_sw_reserved_ia32 = fx_sw_reserved;
- fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
- }
+ return ret;
}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 683749b80ae2..c8340156bfd2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -4,21 +4,33 @@
*
* Author: Suresh Siddha <suresh.b.siddha@intel.com>
*/
+#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
+#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
#include <asm/fpu/api.h>
-#include <asm/fpu/internal.h>
-#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xcr.h>
#include <asm/tlbflush.h>
-#include <asm/cpufeature.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define for_each_extended_xfeature(bit, mask) \
+ (bit) = FIRST_EXTENDED_XFEATURE; \
+ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
/*
* Although we spell it out in here, the Processor Trace
@@ -39,39 +51,40 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "AMX Tile config" ,
+ "AMX Tile data" ,
+ "unknown xstate feature" ,
};
-static short xsave_cpuid_features[] __initdata = {
- X86_FEATURE_FPU,
- X86_FEATURE_XMM,
- X86_FEATURE_AVX,
- X86_FEATURE_MPX,
- X86_FEATURE_MPX,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_INTEL_PT,
- X86_FEATURE_PKU,
- X86_FEATURE_ENQCMD,
+static unsigned short xsave_cpuid_features[] __initdata = {
+ [XFEATURE_FP] = X86_FEATURE_FPU,
+ [XFEATURE_SSE] = X86_FEATURE_XMM,
+ [XFEATURE_YMM] = X86_FEATURE_AVX,
+ [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
+ [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
+ [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
+ [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
+ [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
+ [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
+ [XFEATURE_PKRU] = X86_FEATURE_PKU,
+ [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};
-/*
- * This represents the full set of bits that should ever be set in a kernel
- * XSAVE buffer, both supervisor and user xstates.
- */
-u64 xfeatures_mask_all __read_mostly;
-
-static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
-/*
- * The XSAVE area of kernel can be in standard or compacted format;
- * it is always in standard format for user mode. This is the user
- * mode standard format size used for signal and ptrace frames.
- */
-unsigned int fpu_user_xstate_size;
+#define XSTATE_FLAG_SUPERVISOR BIT(0)
+#define XSTATE_FLAG_ALIGNED64 BIT(1)
/*
* Return whether the system supports a given xfeature.
@@ -80,7 +93,7 @@ unsigned int fpu_user_xstate_size;
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
+ u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -111,96 +124,42 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
-static bool xfeature_is_supervisor(int xfeature_nr)
+static bool xfeature_is_aligned64(int xfeature_nr)
{
- /*
- * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1)
- * returns ECX[0] set to (1) for a supervisor state, and cleared (0)
- * for a user state.
- */
- u32 eax, ebx, ecx, edx;
-
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
- return ecx & 1;
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
}
-/*
- * When executing XSAVEOPT (or other optimized XSAVE instructions), if
- * a processor implementation detects that an FPU state component is still
- * (or is again) in its initialized state, it may clear the corresponding
- * bit in the header.xfeatures field, and can skip the writeout of registers
- * to the corresponding memory layout.
- *
- * This means that when the bit is zero, the state component might still contain
- * some previous - non-initialized register state.
- *
- * Before writing xstate information to user-space we sanitize those components,
- * to always ensure that the memory layout of a feature will be in the init state
- * if the corresponding header bit is zero. This is to ensure that user-space doesn't
- * see some stale state in the memory layout during signal handling, debugging etc.
- */
-void fpstate_sanitize_xstate(struct fpu *fpu)
+static bool xfeature_is_supervisor(int xfeature_nr)
{
- struct fxregs_state *fx = &fpu->state.fxsave;
- int feature_bit;
- u64 xfeatures;
-
- if (!use_xsaveopt())
- return;
-
- xfeatures = fpu->state.xsave.header.xfeatures;
-
- /*
- * None of the feature bits are in init state. So nothing else
- * to do for us, as the memory layout is up to date.
- */
- if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
- return;
-
- /*
- * FP is in init state
- */
- if (!(xfeatures & XFEATURE_MASK_FP)) {
- fx->cwd = 0x37f;
- fx->swd = 0;
- fx->twd = 0;
- fx->fop = 0;
- fx->rip = 0;
- fx->rdp = 0;
- memset(fx->st_space, 0, sizeof(fx->st_space));
- }
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
+}
- /*
- * SSE is in init state
- */
- if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
+static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
+{
+ unsigned int offs, i;
/*
- * First two features are FPU and SSE, which above we handled
- * in a special way already:
+ * Non-compacted format and legacy features use the cached fixed
+ * offsets.
*/
- feature_bit = 0x2;
- xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
+ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
+ xfeature <= XFEATURE_SSE)
+ return xstate_offsets[xfeature];
/*
- * Update all the remaining memory layouts according to their
- * standard xstate layout, if their header bit is in the init
- * state:
+ * Compacted format offsets depend on the actual content of the
+ * compacted xsave area which is determined by the xcomp_bv header
+ * field.
*/
- while (xfeatures) {
- if (xfeatures & 0x1) {
- int offset = xstate_comp_offsets[feature_bit];
- int size = xstate_sizes[feature_bit];
-
- memcpy((void *)fx + offset,
- (void *)&init_fpstate.xsave + offset,
- size);
- }
-
- xfeatures >>= 1;
- feature_bit++;
+ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for_each_extended_xfeature(i, xcomp_bv) {
+ if (xfeature_is_aligned64(i))
+ offs = ALIGN(offs, 64);
+ if (i == xfeature)
+ break;
+ offs += xstate_sizes[i];
}
+ return offs;
}
/*
@@ -209,51 +168,49 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
- u64 unsup_bits;
-
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
return;
- /*
- * Unsupported supervisor xstates should not be found in
- * the xfeatures mask.
- */
- unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
- WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n",
- unsup_bits);
-
- xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
cr4_set_bits(X86_CR4_OSXSAVE);
/*
+ * Must happen after CR4 setup and before xsetbv() to allow KVM
+ * lazy passthrough. Write independent of the dynamic state static
+ * key as that does not work on the boot CPU. This also ensures
+ * that any stale state is wiped out from XFD.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XFD))
+ wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+
+ /*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
* managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
* states can be set here.
*/
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
- xfeatures_mask_dynamic());
+ xfeatures_mask_independent());
}
}
static bool xfeature_enabled(enum xfeature xfeature)
{
- return xfeatures_mask_all & BIT_ULL(xfeature);
+ return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}
/*
* Record the offsets and sizes of various xstates contained
* in the XSAVE state memory layout.
*/
-static void __init setup_xstate_features(void)
+static void __init setup_xstate_cache(void)
{
u32 eax, ebx, ecx, edx, i;
- /* start at the beginnning of the "extended state" */
+ /* start at the beginning of the "extended state" */
unsigned int last_good_offset = offsetof(struct xregs_state,
extended_state_area);
/*
@@ -269,13 +226,11 @@ static void __init setup_xstate_features(void)
xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
xmm_space);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
xstate_sizes[i] = eax;
+ xstate_flags[i] = ecx;
/*
* If an xfeature is supervisor state, the offset in EBX is
@@ -321,6 +276,8 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
print_xstate_feature(XFEATURE_MASK_PASID);
+ print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
+ print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}
/*
@@ -333,162 +290,100 @@ static void __init print_xstate_features(void)
} while (0)
/*
- * We could cache this like xstate_size[], but we only use
- * it here, so it would be a waste of space.
- */
-static int xfeature_is_aligned(int xfeature_nr)
-{
- u32 eax, ebx, ecx, edx;
-
- CHECK_XFEATURE(xfeature_nr);
-
- if (!xfeature_enabled(xfeature_nr)) {
- WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n",
- xfeature_nr);
- return 0;
- }
-
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
- /*
- * The value returned by ECX[1] indicates the alignment
- * of state component 'i' when the compacted format
- * of the extended region of an XSAVE area is used:
- */
- return !!(ecx & 2);
-}
-
-/*
- * This function sets up offsets and sizes of all extended states in
- * xsave area. This supports both standard format and compacted format
- * of the xsave area.
+ * Print out xstate component offsets and sizes
*/
-static void __init setup_xstate_comp_offsets(void)
+static void __init print_xstate_offset_size(void)
{
- unsigned int next_offset;
int i;
- /*
- * The FP xstates and SSE xstates are legacy states. They are always
- * in the fixed offsets in the xsave area in either compacted form
- * or standard form.
- */
- xstate_comp_offsets[XFEATURE_FP] = 0;
- xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
- xmm_space);
-
- if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (xfeature_enabled(i))
- xstate_comp_offsets[i] = xstate_offsets[i];
- }
- return;
- }
-
- next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
- if (xfeature_is_aligned(i))
- next_offset = ALIGN(next_offset, 64);
-
- xstate_comp_offsets[i] = next_offset;
- next_offset += xstate_sizes[i];
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+ i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
+ i, xstate_sizes[i]);
}
}
/*
- * Setup offsets of a supervisor-state-only XSAVES buffer:
- *
- * The offsets stored in xstate_comp_offsets[] only work for one specific
- * value of the Requested Feature BitMap (RFBM). In cases where a different
- * RFBM value is used, a different set of offsets is required. This set of
- * offsets is for when RFBM=xfeatures_mask_supervisor().
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
*/
-static void __init setup_supervisor_only_offsets(void)
+static __init void os_xrstor_booting(struct xregs_state *xstate)
{
- unsigned int next_offset;
- int i;
-
- next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i) || !xfeature_is_supervisor(i))
- continue;
+ u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
- if (xfeature_is_aligned(i))
- next_offset = ALIGN(next_offset, 64);
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
- xstate_supervisor_only_offsets[i] = next_offset;
- next_offset += xstate_sizes[i];
- }
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
}
/*
- * Print out xstate component offsets and sizes
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
*/
-static void __init print_xstate_offset_size(void)
-{
- int i;
-
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
- pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
- i, xstate_comp_offsets[i], i, xstate_sizes[i]);
- }
-}
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_XTILE)
/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
- static int on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return;
- setup_xstate_features();
print_xstate_features();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
- xfeatures_mask_all;
+ xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
/*
* Init all the features state with header.xfeatures being 0x0
*/
- copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+ os_xrstor_booting(&init_fpstate.regs.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
- */
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
-}
-
-static int xfeature_uncompacted_offset(int xfeature_nr)
-{
- u32 eax, ebx, ecx, edx;
-
- /*
- * Only XSAVES supports supervisor states and it uses compacted
- * format. Checking a supervisor state's uncompacted offset is
- * an error.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVEC/S is available because XSAVEC/S uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) {
- WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
- return -1;
- }
-
- CHECK_XFEATURE(xfeature_nr);
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
- return ebx;
+ fxsave(&init_fpstate.regs.fxsave);
}
int xfeature_size(int xfeature_nr)
@@ -500,25 +395,12 @@ int xfeature_size(int xfeature_nr)
return eax;
}
-/*
- * 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
- * 2. using the compacted format
- *
- * Use this function when dealing with the compacted format so
- * that it is obvious which aspect of 'XSAVES' is being handled
- * by the calling code.
- */
-int using_compacted_format(void)
-{
- return boot_cpu_has(X86_FEATURE_XSAVES);
-}
-
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr)
+static int validate_user_xstate_header(const struct xstate_header *hdr,
+ struct fpstate *fpstate)
{
/* No unknown or supervisor features may be set */
- if (hdr->xfeatures & ~xfeatures_mask_user())
+ if (hdr->xfeatures & ~fpstate->user_xfeatures)
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -538,7 +420,7 @@ int validate_user_xstate_header(const struct xstate_header *hdr)
return 0;
}
-static void __xstate_dump_leaves(void)
+static void __init __xstate_dump_leaves(void)
{
int i;
u32 eax, ebx, ecx, edx;
@@ -573,12 +455,73 @@ static void __xstate_dump_leaves(void)
} \
} while (0)
+/**
+ * check_xtile_data_against_struct - Check tile data state size.
+ *
+ * Calculate the state size by multiplying the single tile size which is
+ * recorded in a C struct, and the number of tiles that the CPU informs.
+ * Compare the provided size with the calculation.
+ *
+ * @size: The tile data state size
+ *
+ * Returns: 0 on success, -EINVAL on mismatch.
+ */
+static int __init check_xtile_data_against_struct(int size)
+{
+ u32 max_palid, palid, state_size;
+ u32 eax, ebx, ecx, edx;
+ u16 max_tile;
+
+ /*
+ * Check the maximum palette id:
+ * eax: the highest numbered palette subleaf.
+ */
+ cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
+
+ /*
+ * Cross-check each tile size and find the maximum number of
+ * supported tiles.
+ */
+ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
+ u16 tile_size, max;
+
+ /*
+ * Check the tile size info:
+ * eax[31:16]: bytes per title
+ * ebx[31:16]: the max names (or max number of tiles)
+ */
+ cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
+ tile_size = eax >> 16;
+ max = ebx >> 16;
+
+ if (tile_size != sizeof(struct xtile_data)) {
+ pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA),
+ sizeof(struct xtile_data), tile_size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+
+ if (max > max_tile)
+ max_tile = max;
+ }
+
+ state_size = sizeof(struct xtile_data) * max_tile;
+ if (size != state_size) {
+ pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA), state_size, size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+ return 0;
+}
+
/*
* We have a C struct for each 'xstate'. We need to ensure
* that our software representation matches what the CPU
* tells us about the state's size.
*/
-static void check_xstate_against_struct(int nr)
+static bool __init check_xstate_against_struct(int nr)
{
/*
* Ask the CPU for the size of the state.
@@ -596,6 +539,11 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
+ XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
+
+ /* The tile data size varies between implementations. */
+ if (nr == XFEATURE_XTILE_DATA)
+ check_xtile_data_against_struct(sz);
/*
* Make *SURE* to add any feature numbers in below if
@@ -605,10 +553,25 @@ static void check_xstate_against_struct(int nr)
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
- ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) {
+ ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
+ return false;
}
+ return true;
+}
+
+static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+{
+ unsigned int topmost = fls64(xfeatures) - 1;
+ unsigned int offset = xstate_offsets[topmost];
+
+ if (topmost <= XFEATURE_SSE)
+ return sizeof(struct xregs_state);
+
+ if (compacted)
+ offset = xfeature_get_offset(xfeatures, topmost);
+ return offset + xstate_sizes[topmost];
}
/*
@@ -616,48 +579,34 @@ static void check_xstate_against_struct(int nr)
* how large the XSAVE buffer needs to be. We are recalculating
* it to be safe.
*
- * Dynamic XSAVE features allocate their own buffers and are not
+ * Independent XSAVE features allocate their own buffers and are not
* covered by these checks. Only the size of the buffer for task->fpu
* is checked here.
*/
-static void do_extra_xstate_size_checks(void)
+static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
- int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
int i;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
- check_xstate_against_struct(i);
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!check_xstate_against_struct(i))
+ return false;
/*
* Supervisor state components can be managed only by
- * XSAVES, which is compacted-format only.
- */
- if (!using_compacted_format())
- XSTATE_WARN_ON(xfeature_is_supervisor(i));
-
- /* Align from the end of the previous feature */
- if (xfeature_is_aligned(i))
- paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
- /*
- * The offset of a given state in the non-compacted
- * format is given to us in a CPUID leaf. We check
- * them for being ordered (increasing offsets) in
- * setup_xstate_features().
- */
- if (!using_compacted_format())
- paranoid_xstate_size = xfeature_uncompacted_offset(i);
- /*
- * The compacted-format offset always depends on where
- * the previous state ended.
+ * XSAVES.
*/
- paranoid_xstate_size += xfeature_size(i);
+ if (!xsaves && xfeature_is_supervisor(i)) {
+ XSTATE_WARN_ON(1);
+ return false;
+ }
}
- XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
+ size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
+ XSTATE_WARN_ON(size != kernel_size);
+ return size == kernel_size;
}
-
/*
* Get total size of enabled xstates in XCR0 | IA32_XSS.
*
@@ -665,8 +614,11 @@ static void do_extra_xstate_size_checks(void)
* the size of the *user* states. If we use it to size a buffer
* that we use 'XSAVES' on, we could potentially overflow the
* buffer because 'XSAVES' saves system states too.
+ *
+ * This also takes compaction into account. So this works for
+ * XSAVEC as well.
*/
-static unsigned int __init get_xsaves_size(void)
+static unsigned int __init get_compacted_size(void)
{
unsigned int eax, ebx, ecx, edx;
/*
@@ -676,39 +628,43 @@ static unsigned int __init get_xsaves_size(void)
* containing all the state components
* corresponding to bits currently set in
* XCR0 | IA32_XSS.
+ *
+ * When XSAVES is not available but XSAVEC is (virt), then there
+ * are no supervisor states, but XSAVEC still uses compacted
+ * format.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
return ebx;
}
/*
- * Get the total size of the enabled xstates without the dynamic supervisor
+ * Get the total size of the enabled xstates without the independent supervisor
* features.
*/
-static unsigned int __init get_xsaves_size_no_dynamic(void)
+static unsigned int __init get_xsave_compacted_size(void)
{
- u64 mask = xfeatures_mask_dynamic();
+ u64 mask = xfeatures_mask_independent();
unsigned int size;
if (!mask)
- return get_xsaves_size();
+ return get_compacted_size();
- /* Disable dynamic features. */
+ /* Disable independent features. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
/*
* Ask the hardware what size is required of the buffer.
* This is the size required for the task->fpu buffer.
*/
- size = get_xsaves_size();
+ size = get_compacted_size();
- /* Re-enable dynamic features so XSAVES will work on them again. */
+ /* Re-enable independent features so XSAVES will work on them again. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
return size;
}
-static unsigned int __init get_xsave_size(void)
+static unsigned int __init get_xsave_size_user(void)
{
unsigned int eax, ebx, ecx, edx;
/*
@@ -726,44 +682,55 @@ static unsigned int __init get_xsave_size(void)
* Will the runtime-enumerated 'xstate_size' fit in the init
* task's statically-allocated buffer?
*/
-static bool is_supported_xstate_size(unsigned int test_xstate_size)
+static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
{
- if (test_xstate_size <= sizeof(union fpregs_state))
+ if (test_xstate_size <= sizeof(init_fpstate.regs))
return true;
pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
- sizeof(union fpregs_state), test_xstate_size);
+ sizeof(init_fpstate.regs), test_xstate_size);
return false;
}
static int __init init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
- unsigned int possible_xstate_size;
- unsigned int xsave_size;
+ unsigned int user_size, kernel_size, kernel_default_size;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
- xsave_size = get_xsave_size();
+ /* Uncompacted user space size */
+ user_size = get_xsave_size_user();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- possible_xstate_size = get_xsaves_size_no_dynamic();
+ /*
+ * XSAVES kernel size includes supervisor states and uses compacted
+ * format. XSAVEC uses compacted format, but does not save
+ * supervisor states.
+ *
+ * XSAVE[OPT] do not support supervisor states so kernel and user
+ * size is identical.
+ */
+ if (compacted)
+ kernel_size = get_xsave_compacted_size();
else
- possible_xstate_size = xsave_size;
+ kernel_size = user_size;
- /* Ensure we have the space to store all enabled: */
- if (!is_supported_xstate_size(possible_xstate_size))
+ kernel_default_size =
+ xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
+
+ /* Ensure we have the space to store all default enabled features. */
+ if (!is_supported_xstate_size(kernel_default_size))
return -EINVAL;
- /*
- * The size is OK, we are definitely going to use xsave,
- * make it known to the world that we need more space.
- */
- fpu_kernel_xstate_size = possible_xstate_size;
- do_extra_xstate_size_checks();
+ if (!paranoid_xstate_size_valid(kernel_size))
+ return -EINVAL;
+
+ fpu_kernel_cfg.max_size = kernel_size;
+ fpu_user_cfg.max_size = user_size;
+
+ fpu_kernel_cfg.default_size = kernel_default_size;
+ fpu_user_cfg.default_size =
+ xstate_calculate_size(fpu_user_cfg.default_features, false);
- /*
- * User space is always in standard format.
- */
- fpu_user_xstate_size = xsave_size;
return 0;
}
@@ -771,27 +738,38 @@ static int __init init_xstate_size(void)
* We enabled the XSAVE hardware, but something went wrong and
* we can not use it. Disable it.
*/
-static void fpu__init_disable_system_xstate(void)
+static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
- xfeatures_mask_all = 0;
+ fpu_kernel_cfg.max_features = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ /* Restore the legacy size.*/
+ fpu_kernel_cfg.max_size = legacy_size;
+ fpu_kernel_cfg.default_size = legacy_size;
+ fpu_user_cfg.max_size = legacy_size;
+ fpu_user_cfg.default_size = legacy_size;
+
+ /*
+ * Prevent enabling the static branch which enables writes to the
+ * XFD MSR.
+ */
+ init_fpstate.xfd = 0;
+
+ fpstate_reset(&current->thread.fpu);
}
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
*/
-void __init fpu__init_system_xstate(void)
+void __init fpu__init_system_xstate(unsigned int legacy_size)
{
unsigned int eax, ebx, ecx, edx;
- static int on_boot_cpu __initdata = 1;
+ u64 xfeatures;
int err;
int i;
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
-
if (!boot_cpu_has(X86_FEATURE_FPU)) {
pr_info("x86/fpu: No FPU detected\n");
return;
@@ -812,22 +790,22 @@ void __init fpu__init_system_xstate(void)
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all = eax + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all |= ecx + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
- if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
- xfeatures_mask_all);
+ fpu_kernel_cfg.max_features);
goto out_disable;
}
@@ -835,39 +813,90 @@ void __init fpu__init_system_xstate(void)
* Clear XSAVE features that are disabled in the normal CPUID.
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
- if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask_all &= ~BIT_ULL(i);
+ unsigned short cid = xsave_cpuid_features[i];
+
+ /* Careful: X86_FEATURE_FPU is 0! */
+ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
+ fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
}
- xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
+ if (!cpu_feature_enabled(X86_FEATURE_XFD))
+ fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+ else
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+ fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
+ fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+
+ /* Clean out dynamic features from default */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
+ fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features;
+ fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ /* Store it for paranoia check at the end */
+ xfeatures = fpu_kernel_cfg.max_features;
+
+ /*
+ * Initialize the default XFD state in initfp_state and enable the
+ * dynamic sizing mechanism if dynamic states are available. The
+ * static key cannot be enabled here because this runs before
+ * jump_label_init(). This is delayed to an initcall.
+ */
+ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
+
+ /* Set up compaction feature bit */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
+ cpu_feature_enabled(X86_FEATURE_XSAVES))
+ setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
+
+ /* Cache size, offset and flags for initialization */
+ setup_xstate_cache();
+
err = init_xstate_size();
if (err)
goto out_disable;
+ /* Reset the state for the current task */
+ fpstate_reset(&current->thread.fpu);
+
/*
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user());
+ update_regset_xstate_info(fpu_user_cfg.max_size,
+ fpu_user_cfg.max_features);
- fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
- setup_xstate_comp_offsets();
- setup_supervisor_only_offsets();
- print_xstate_offset_size();
+ /*
+ * Paranoia check whether something in the setup modified the
+ * xfeatures mask.
+ */
+ if (xfeatures != fpu_kernel_cfg.max_features) {
+ pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
+ xfeatures, fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask_all,
- fpu_kernel_xstate_size,
- boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+ fpu_kernel_cfg.max_features,
+ fpu_kernel_cfg.max_size,
+ boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
return;
out_disable:
/* something went wrong, try to boot without any XSAVE support */
- fpu__init_disable_system_xstate();
+ fpu__init_disable_system_xstate(legacy_size);
}
/*
@@ -878,17 +907,20 @@ void fpu__resume_cpu(void)
/*
* Restore XCR0 on xsave capable CPUs:
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
- xfeatures_mask_dynamic());
+ xfeatures_mask_independent());
}
+
+ if (fpu_state_size_dynamic())
+ wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}
/*
@@ -898,13 +930,19 @@ void fpu__resume_cpu(void)
*/
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
- if (!xfeature_enabled(xfeature_nr)) {
- WARN_ON_FPU(1);
+ u64 xcomp_bv = xsave->header.xcomp_bv;
+
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
return NULL;
+
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
+ if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
+ return NULL;
}
- return (void *)xsave + xstate_comp_offsets[xfeature_nr];
+ return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}
+
/*
* Given the xsave area and a state inside, this function returns the
* address of the state.
@@ -935,8 +973,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
* We should not ever be requesting features that we
* have not enabled.
*/
- WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
- "get of unsupported state");
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
/*
* This assumes the last 'xsave*' instruction to
* have requested that 'xfeature_nr' be saved.
@@ -953,37 +992,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
-
-/*
- * This wraps up the common operations that need to occur when retrieving
- * data from xsave state. It first ensures that the current task was
- * using the FPU and retrieves the data in to a buffer. It then calculates
- * the offset of the requested field in the buffer.
- *
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
- * XFEATURE_SSE, etc...)
- * Output:
- * address of the state in the xsave area or NULL if the state
- * is not present or is in its 'init state'.
- */
-const void *get_xsave_field_ptr(int xfeature_nr)
-{
- struct fpu *fpu = &current->thread.fpu;
-
- /*
- * fpu__save() takes the CPU's xstate registers
- * and saves them off to the 'fpu memory buffer.
- */
- fpu__save(fpu);
-
- return get_xsave_addr(&fpu->state.xsave, xfeature_nr);
-}
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -992,17 +1000,16 @@ const void *get_xsave_field_ptr(int xfeature_nr)
* rights for @pkey to @init_val.
*/
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val)
+ unsigned long init_val)
{
- u32 old_pkru;
- int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
- u32 new_pkru_bits = 0;
+ u32 old_pkru, new_pkru_bits = 0;
+ int pkey_shift;
/*
* This check implies XSAVE support. OSPKE only gets
* set if we enable XSAVE and we enable PKU in XCR0.
*/
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return -EINVAL;
/*
@@ -1010,7 +1017,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
* values originating from in-kernel users. Complain
* if a bad value is observed.
*/
- WARN_ON_ONCE(pkey >= arch_max_pkey());
+ if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+ return -EINVAL;
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
@@ -1019,6 +1027,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
new_pkru_bits |= PKRU_WD_BIT;
/* Shift the bits in to the correct place in PKRU for pkey: */
+ pkey_shift = pkey * PKRU_BITS_PER_PKEY;
new_pkru_bits <<= pkey_shift;
/* Get old PKRU and mask off any old bits in place: */
@@ -1032,118 +1041,199 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
}
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
-/*
- * Weird legacy quirk: SSE and YMM states store information in the
- * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
- * area is marked as unused in the xfeatures header, we need to copy
- * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
- */
-static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
+ void *init_xstate, unsigned int size)
{
- if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
- return false;
-
- if (xfeatures & XFEATURE_MASK_FP)
- return false;
-
- return true;
-}
-
-static void fill_gap(struct membuf *to, unsigned *last, unsigned offset)
-{
- if (*last >= offset)
- return;
- membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last);
- *last = offset;
+ membuf_write(to, from_xstate ? xstate : init_xstate, size);
}
-static void copy_part(struct membuf *to, unsigned *last, unsigned offset,
- unsigned size, void *from)
-{
- fill_gap(to, last, offset);
- membuf_write(to, from, size);
- *last = offset + size;
-}
-
-/*
- * Convert from kernel XSAVES compacted format to standard format and copy
- * to a kernel-space ptrace buffer.
+/**
+ * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @fpstate: The fpstate buffer from which to copy
+ * @pkru_val: The PKRU value to store in the PKRU component
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @mode. UABI XSTATE is always uncompacted!
*
- * It supports partial copy but pos always starts from zero. This is called
- * from xstateregs_get() and there we check the CPU has XSAVES.
+ * It supports partial copy but @to.pos always starts from zero.
*/
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave)
+void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode)
{
+ const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ struct xregs_state *xinit = &init_fpstate.regs.xsave;
+ struct xregs_state *xsave = &fpstate->regs.xsave;
struct xstate_header header;
- const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
- unsigned size = to.left;
- unsigned last = 0;
+ unsigned int zerofrom;
+ u64 mask;
int i;
- /*
- * The destination is a ptrace buffer; we put in only user xstates:
- */
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= xfeatures_mask_user();
-
- if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(&to, &last, 0, off_mxcsr, &xsave->i387);
- if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
- copy_part(&to, &last, off_mxcsr,
- MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr);
- if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(&to, &last, offsetof(struct fxregs_state, st_space),
- 128, &xsave->i387.st_space);
- if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(&to, &last, xstate_offsets[XFEATURE_SSE],
- 256, &xsave->i387.xmm_space);
- /*
- * Fill xsave->i387.sw_reserved value for ptrace frame:
- */
- copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved),
- 48, xstate_fx_sw_bytes);
+
+ /* Mask out the feature bits depending on copy mode */
+ switch (copy_mode) {
+ case XSTATE_COPY_FP:
+ header.xfeatures &= XFEATURE_MASK_FP;
+ break;
+
+ case XSTATE_COPY_FX:
+ header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
+ break;
+
+ case XSTATE_COPY_XSAVE:
+ header.xfeatures &= fpstate->user_xfeatures;
+ break;
+ }
+
+ /* Copy FP state up to MXCSR */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
+ &xinit->i387, off_mxcsr);
+
+ /* Copy MXCSR when SSE or YMM are set in the feature mask */
+ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
+ &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
+ MXCSR_AND_FLAGS_SIZE);
+
+ /* Copy the remaining FP state */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP,
+ &to, &xsave->i387.st_space, &xinit->i387.st_space,
+ sizeof(xsave->i387.st_space));
+
+ /* Copy the SSE state - shared with YMM, but independently managed */
+ copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
+ &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
+ sizeof(xsave->i387.xmm_space));
+
+ if (copy_mode != XSTATE_COPY_XSAVE)
+ goto out;
+
+ /* Zero the padding area */
+ membuf_zero(&to, sizeof(xsave->i387.padding));
+
+ /* Copy xsave->i387.sw_reserved */
+ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
+
+ /* Copy the user space relevant state of @xsave->header */
+ membuf_write(&to, &header, sizeof(header));
+
+ zerofrom = offsetof(struct xregs_state, extended_state_area);
+
/*
- * Copy xregs_state->header:
+ * The ptrace buffer is in non-compacted XSAVE format. In
+ * non-compacted format disabled features still occupy state space,
+ * but there is no state to copy from in the compacted
+ * init_fpstate. The gap tracking will zero these states.
*/
- copy_part(&to, &last, offsetof(struct xregs_state, header),
- sizeof(header), &header);
+ mask = fpstate->user_xfeatures;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ for_each_extended_xfeature(i, mask) {
/*
- * Copy only in-use xstates:
+ * If there was a feature or alignment gap, zero the space
+ * in the destination buffer.
*/
- if ((header.xfeatures >> i) & 1) {
- void *src = __raw_xsave_addr(xsave, i);
+ if (zerofrom < xstate_offsets[i])
+ membuf_zero(&to, xstate_offsets[i] - zerofrom);
- copy_part(&to, &last, xstate_offsets[i],
- xstate_sizes[i], src);
+ if (i == XFEATURE_PKRU) {
+ struct pkru_state pkru = {0};
+ /*
+ * PKRU is not necessarily up to date in the
+ * XSAVE buffer. Use the provided value.
+ */
+ pkru.pkru = pkru_val;
+ membuf_write(&to, &pkru, sizeof(pkru));
+ } else {
+ copy_feature(header.xfeatures & BIT_ULL(i), &to,
+ __raw_xsave_addr(xsave, i),
+ __raw_xsave_addr(xinit, i),
+ xstate_sizes[i]);
}
-
+ /*
+ * Keep track of the last copied state in the non-compacted
+ * target buffer for gap zeroing.
+ */
+ zerofrom = xstate_offsets[i] + xstate_sizes[i];
}
- fill_gap(&to, &last, size);
+
+out:
+ if (to.left)
+ membuf_zero(&to, to.left);
}
-/*
- * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
- * and copy to the target thread. This is called from xstateregs_set().
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
*/
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
+{
+ __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
+ tsk->thread.pkru, copy_mode);
+}
+
+static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
+ const void *kbuf, const void __user *ubuf)
{
+ if (kbuf) {
+ memcpy(dst, kbuf + offset, size);
+ } else {
+ if (copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+
+static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
+ const void __user *ubuf)
+{
+ struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
- int i;
struct xstate_header hdr;
+ u64 mask;
+ int i;
offset = offsetof(struct xregs_state, header);
- size = sizeof(hdr);
-
- memcpy(&hdr, kbuf + offset, size);
+ if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
+ return -EFAULT;
- if (validate_user_xstate_header(&hdr))
+ if (validate_user_xstate_header(&hdr, fpstate))
return -EINVAL;
+ /* Validate MXCSR when any of the related features is in use */
+ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+ if (hdr.xfeatures & mask) {
+ u32 mxcsr[2];
+
+ offset = offsetof(struct fxregs_state, mxcsr);
+ if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
+ return -EFAULT;
+
+ /* Reserved bits in MXCSR must be zero. */
+ if (mxcsr[0] & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ /* SSE and YMM require MXCSR even when FP is not in use. */
+ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
+ xsave->i387.mxcsr = mxcsr[0];
+ xsave->i387.mxcsr_mask = mxcsr[1];
+ }
+ }
+
for (i = 0; i < XFEATURE_MAX; i++) {
- u64 mask = ((u64)1 << i);
+ mask = BIT_ULL(i);
if (hdr.xfeatures & mask) {
void *dst = __raw_xsave_addr(xsave, i);
@@ -1151,16 +1241,11 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
offset = xstate_offsets[i];
size = xstate_sizes[i];
- memcpy(dst, kbuf + offset, size);
+ if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
+ return -EFAULT;
}
}
- if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
- }
-
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1176,186 +1261,489 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
}
/*
- * Convert from a ptrace or sigreturn standard-format user-space buffer to
- * kernel XSAVES format and copy to the target thread. This is called from
- * xstateregs_set(), as well as potentially from the sigreturn() and
- * rt_sigreturn() system calls.
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
+ * format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
{
- unsigned int offset, size;
- int i;
- struct xstate_header hdr;
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL);
+}
- offset = offsetof(struct xregs_state, header);
- size = sizeof(hdr);
+/*
+ * Convert from a sigreturn standard-format user-space buffer to kernel
+ * XSAVE[S] format and copy to the target thread. This is called from the
+ * sigreturn() and rt_sigreturn() system calls.
+ */
+int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
+ const void __user *ubuf)
+{
+ return copy_uabi_to_xstate(fpstate, NULL, ubuf);
+}
- if (__copy_from_user(&hdr, ubuf + offset, size))
- return -EFAULT;
+static bool validate_independent_components(u64 mask)
+{
+ u64 xchk;
- if (validate_user_xstate_header(&hdr))
- return -EINVAL;
+ if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
+ return false;
- for (i = 0; i < XFEATURE_MAX; i++) {
- u64 mask = ((u64)1 << i);
+ xchk = ~xfeatures_mask_independent();
- if (hdr.xfeatures & mask) {
- void *dst = __raw_xsave_addr(xsave, i);
+ if (WARN_ON_ONCE(!mask || mask & xchk))
+ return false;
- offset = xstate_offsets[i];
- size = xstate_sizes[i];
+ return true;
+}
- if (__copy_from_user(dst, ubuf + offset, size))
- return -EFAULT;
- }
- }
+/**
+ * xsaves - Save selected components to a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to save
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized as
+ * XSAVES does not write the full xstate header. Before first use the
+ * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
+ * can #GP.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xsaves(struct xregs_state *xstate, u64 mask)
+{
+ int err;
- if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
- return -EFAULT;
- }
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+/**
+ * xrstors - Restore selected components from a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to restore
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized
+ * otherwise XRSTORS from that buffer can #GP.
+ *
+ * Proper usage is to restore the state which was saved with
+ * xsaves() into @xstate.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xrstors(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
+{
+ void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
+
+ if (addr)
+ memset(addr, 0, xstate_sizes[xfeature]);
+}
+EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
+ return false;
/*
- * The state that came in from userspace was user-state only.
- * Mask all the user states out of 'xfeatures':
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
+ if (fpstate == &init_fpstate)
+ return rstor;
/*
- * Add back in the features that came in from userspace:
+ * XSAVE(S): clone(), fpu_swap_kvm_fpu()
+ * XRSTORS(S): fpu_swap_kvm_fpu()
*/
- xsave->header.xfeatures |= hdr.xfeatures;
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+ * by XSAVE/XRSTOR. Fail validation it found.
+ */
+ return !mask;
+}
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */
+
+static int __init xfd_update_static_branch(void)
+{
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_state_size_dynamic);
return 0;
}
+arch_initcall(xfd_update_static_branch)
-/*
- * Save only supervisor states to the kernel buffer. This blows away all
- * old states, and is intended to be used only in __fpu__restore_sig(), where
- * user states are restored from the user buffer.
+void fpstate_free(struct fpu *fpu)
+{
+ if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
+ vfree(fpu->fpstate);
+}
+
+/**
+ * fpstate_realloc - Reallocate struct fpstate for the requested new features
+ *
+ * @xfeatures: A bitmap of xstate features which extend the enabled features
+ * of that task
+ * @ksize: The required size for the kernel buffer
+ * @usize: The required size for user space buffers
+ * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
+ *
+ * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
+ * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
+ * with large states are likely to live longer.
+ *
+ * Returns: 0 on success, -ENOMEM on allocation error.
*/
-void copy_supervisor_to_kernel(struct xregs_state *xstate)
+static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
+ unsigned int usize, struct fpu_guest *guest_fpu)
{
- struct xstate_header *header;
- u64 max_bit, min_bit;
- u32 lmask, hmask;
- int err, i;
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *curfps, *newfps = NULL;
+ unsigned int fpsize;
+ bool in_use;
- if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES)))
- return;
+ fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
- if (!xfeatures_mask_supervisor())
- return;
+ newfps = vzalloc(fpsize);
+ if (!newfps)
+ return -ENOMEM;
+ newfps->size = ksize;
+ newfps->user_size = usize;
+ newfps->is_valloc = true;
- max_bit = __fls(xfeatures_mask_supervisor());
- min_bit = __ffs(xfeatures_mask_supervisor());
+ /*
+ * When a guest FPU is supplied, use @guest_fpu->fpstate
+ * as reference independent whether it is in use or not.
+ */
+ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
- lmask = xfeatures_mask_supervisor();
- hmask = xfeatures_mask_supervisor() >> 32;
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+ /* Determine whether @curfps is the active fpstate */
+ in_use = fpu->fpstate == curfps;
- /* We should never fault when copying to a kernel buffer: */
- if (WARN_ON_FPU(err))
- return;
+ if (guest_fpu) {
+ newfps->is_guest = true;
+ newfps->is_confidential = curfps->is_confidential;
+ newfps->in_use = curfps->in_use;
+ guest_fpu->xfeatures |= xfeatures;
+ guest_fpu->uabi_size = usize;
+ }
+ fpregs_lock();
/*
- * At this point, the buffer has only supervisor states and must be
- * converted back to normal kernel format.
+ * If @curfps is in use, ensure that the current state is in the
+ * registers before swapping fpstate as that might invalidate it
+ * due to layout changes.
*/
- header = &xstate->header;
- header->xcomp_bv |= xfeatures_mask_all;
+ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ newfps->xfeatures = curfps->xfeatures | xfeatures;
+
+ if (!guest_fpu)
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
+
+ newfps->xfd = curfps->xfd & ~xfeatures;
+
+ /* Do the final updates within the locked region */
+ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
+
+ if (guest_fpu) {
+ guest_fpu->fpstate = newfps;
+ /* If curfps is active, update the FPU fpstate pointer */
+ if (in_use)
+ fpu->fpstate = newfps;
+ } else {
+ fpu->fpstate = newfps;
+ }
+
+ if (in_use)
+ xfd_update_state(fpu->fpstate);
+ fpregs_unlock();
+
+ /* Only free valloc'ed state */
+ if (curfps && curfps->is_valloc)
+ vfree(curfps);
+
+ return 0;
+}
+
+static int validate_sigaltstack(unsigned int usize)
+{
+ struct task_struct *thread, *leader = current->group_leader;
+ unsigned long framesize = get_sigframe_size();
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ /* get_sigframe_size() is based on fpu_user_cfg.max_size */
+ framesize -= fpu_user_cfg.max_size;
+ framesize += usize;
+ for_each_thread(leader, thread) {
+ if (thread->sas_ss_size && thread->sas_ss_size < framesize)
+ return -ENOSPC;
+ }
+ return 0;
+}
+static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
+{
/*
- * This only moves states up in the buffer. Start with
- * the last state and move backwards so that states are
- * not overwritten until after they are moved. Note:
- * memmove() allows overlapping src/dst buffers.
+ * This deliberately does not exclude !XSAVES as we still might
+ * decide to optionally context switch XCR0 or talk the silicon
+ * vendors into extending XFD for the pre AMX states, especially
+ * AVX512.
*/
- for (i = max_bit; i >= min_bit; i--) {
- u8 *xbuf = (u8 *)xstate;
-
- if (!((header->xfeatures >> i) & 1))
- continue;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ u64 mask;
+ int ret = 0;
+
+ /* Check whether fully enabled */
+ if ((permitted & requested) == requested)
+ return 0;
- /* Move xfeature 'i' into its normal location */
- memmove(xbuf + xstate_comp_offsets[i],
- xbuf + xstate_supervisor_only_offsets[i],
- xstate_sizes[i]);
+ /* Calculate the resulting kernel state size */
+ mask = permitted | requested;
+ /* Take supervisor states into account on the host */
+ if (!guest)
+ mask |= xfeatures_mask_supervisor();
+ ksize = xstate_calculate_size(mask, compacted);
+
+ /* Calculate the resulting user state size */
+ mask &= XFEATURE_MASK_USER_SUPPORTED;
+ usize = xstate_calculate_size(mask, false);
+
+ if (!guest) {
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
}
+
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
+ WRITE_ONCE(perm->__state_perm, mask);
+ /* Protected by sighand lock */
+ perm->__state_size = ksize;
+ perm->__user_state_size = usize;
+ return ret;
}
-/**
- * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
- * an xsave area
- * @xstate: A pointer to an xsave area
- * @mask: Represent the dynamic supervisor features saved into the xsave area
- *
- * Only the dynamic supervisor states sets in the mask are saved into the xsave
- * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
- * supervisor feature). Besides the dynamic supervisor states, the legacy
- * region and XSAVE header are also saved into the xsave area. The supervisor
- * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
- * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
- *
- * The xsave area must be 64-bytes aligned.
+/*
+ * Permissions array to map facilities with more than one component
*/
-void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+static const u64 xstate_prctl_req[XFEATURE_MAX] = {
+ [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
+};
+
+static int xstate_request_perm(unsigned long idx, bool guest)
{
- u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
- u32 lmask, hmask;
- int err;
+ u64 permitted, requested;
+ int ret;
- if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
- return;
+ if (idx >= XFEATURE_MAX)
+ return -EINVAL;
- if (WARN_ON_FPU(!dynamic_mask))
- return;
+ /*
+ * Look up the facility mask which can require more than
+ * one xstate component.
+ */
+ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
+ requested = xstate_prctl_req[idx];
+ if (!requested)
+ return -EOPNOTSUPP;
- lmask = dynamic_mask;
- hmask = dynamic_mask >> 32;
+ if ((fpu_user_cfg.max_features & requested) != requested)
+ return -EOPNOTSUPP;
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+ /* Lockless quick check */
+ permitted = xstate_get_group_perm(guest);
+ if ((permitted & requested) == requested)
+ return 0;
- /* Should never fault when copying to a kernel buffer */
- WARN_ON_FPU(err);
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+ permitted = xstate_get_group_perm(guest);
+
+ /* First vCPU allocation locks the permissions. */
+ if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
+ ret = -EBUSY;
+ else
+ ret = __xstate_request_perm(permitted, requested, guest);
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
}
+int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
+{
+ u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ struct fpu *fpu;
+
+ if (!xfd_event) {
+ if (!guest_fpu)
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ return 0;
+ }
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+
+ /* If not permitted let it die */
+ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EPERM;
+ }
+
+ fpu = &current->group_leader->thread.fpu;
+ perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
+ ksize = perm->__state_size;
+ usize = perm->__user_state_size;
+
+ /*
+ * The feature is permitted. State size is sufficient. Dropping
+ * the lock is safe here even if more features are added from
+ * another task, the retrieved buffer sizes are valid for the
+ * currently requested feature(s).
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Try to allocate a new fpstate. If that fails there is no way
+ * out.
+ */
+ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
+ return -EFAULT;
+ return 0;
+}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ return __xfd_enable_feature(xfd_err, NULL);
+}
+
+#else /* CONFIG_X86_64 */
+static inline int xstate_request_perm(unsigned long idx, bool guest)
+{
+ return -EPERM;
+}
+#endif /* !CONFIG_X86_64 */
+
+u64 xstate_get_guest_group_perm(void)
+{
+ return xstate_get_group_perm(true);
+}
+EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+
/**
- * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
- * an xsave area
- * @xstate: A pointer to an xsave area
- * @mask: Represent the dynamic supervisor features restored from the xsave area
+ * fpu_xstate_prctl - xstate permission operations
+ * @tsk: Redundant pointer to current
+ * @option: A subfunction of arch_prctl()
+ * @arg2: option argument
+ * Return: 0 if successful; otherwise, an error code
+ *
+ * Option arguments:
*
- * Only the dynamic supervisor states sets in the mask are restored from the
- * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
- * dynamic supervisor feature). Besides the dynamic supervisor states, the
- * legacy region and XSAVE header are also restored from the xsave area. The
- * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
- * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
+ * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
+ * ARCH_REQ_XCOMP_PERM: Facility number requested
*
- * The xsave area must be 64-bytes aligned.
+ * For facilities which require more than one XSTATE component, the request
+ * must be the highest state component number related to that facility,
+ * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
+ * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
*/
-void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+long fpu_xstate_prctl(int option, unsigned long arg2)
{
- u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
- u32 lmask, hmask;
- int err;
+ u64 __user *uptr = (u64 __user *)arg2;
+ u64 permitted, supported;
+ unsigned long idx = arg2;
+ bool guest = false;
- if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
- return;
+ switch (option) {
+ case ARCH_GET_XCOMP_SUPP:
+ supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
+ return put_user(supported, uptr);
- if (WARN_ON_FPU(!dynamic_mask))
- return;
+ case ARCH_GET_XCOMP_PERM:
+ /*
+ * Lockless snapshot as it can also change right after the
+ * dropping the lock.
+ */
+ permitted = xstate_get_host_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
- lmask = dynamic_mask;
- hmask = dynamic_mask >> 32;
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ permitted = xstate_get_guest_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ guest = true;
+ fallthrough;
- /* Should never fault when copying from a kernel buffer */
- WARN_ON_FPU(err);
+ case ARCH_REQ_XCOMP_PERM:
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return -EOPNOTSUPP;
+
+ return xstate_request_perm(idx, guest);
+
+ default:
+ return -EINVAL;
+ }
}
#ifdef CONFIG_PROC_PID_ARCH_STATUS
@@ -1402,60 +1790,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-
-#ifdef CONFIG_IOMMU_SUPPORT
-void update_pasid(void)
-{
- u64 pasid_state;
- u32 pasid;
-
- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
- return;
-
- if (!current->mm)
- return;
-
- pasid = READ_ONCE(current->mm->pasid);
- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
- pasid_state = pasid == PASID_DISABLED ?
- pasid : pasid | MSR_IA32_PASID_VALID;
-
- /*
- * No need to hold fregs_lock() since the task's fpstate won't
- * be changed by others (e.g. ptrace) while the task is being
- * switched to or is in IPI.
- */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- /* The MSR is active and can be directly updated. */
- wrmsrl(MSR_IA32_PASID, pasid_state);
- } else {
- struct fpu *fpu = &current->thread.fpu;
- struct ia32_pasid_state *ppasid_state;
- struct xregs_state *xsave;
-
- /*
- * The CPU's xstate registers are not currently active. Just
- * update the PASID state in the memory buffer here. The
- * PASID MSR will be loaded when returning to user mode.
- */
- xsave = &fpu->state.xsave;
- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
- /*
- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
- * won't be NULL and no need to check its value.
- *
- * Only update the task's PASID state when it's different
- * from the mm's pasid.
- */
- if (ppasid_state->pasid != pasid_state) {
- /*
- * Invalid fpregs so that state restoring will pick up
- * the PASID state.
- */
- __fpu_invalidate_fpregs_state(fpu);
- ppasid_state->pasid = pasid_state;
- }
- }
-}
-#endif /* CONFIG_IOMMU_SUPPORT */
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
new file mode 100644
index 000000000000..5ad47031383b
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -0,0 +1,326 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_XSTATE_H
+#define __X86_KERNEL_FPU_XSTATE_H
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/xcr.h>
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u64, xfd_state);
+#endif
+
+static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
+ xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
+}
+
+static inline u64 xstate_get_group_perm(bool guest)
+{
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
+
+ /* Pairs with WRITE_ONCE() in xstate_request_perm() */
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ return READ_ONCE(perm->__state_perm);
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ return xstate_get_group_perm(false);
+}
+
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode);
+extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
+extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+
+
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system_xstate(unsigned int legacy_size);
+
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+static inline u64 xfeatures_mask_supervisor(void)
+{
+ return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+}
+
+static inline u64 xfeatures_mask_independent(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
+ return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+
+ return XFEATURE_MASK_INDEPENDENT;
+}
+
+/* XSAVE/XRSTOR wrapper functions */
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/*
+ * After this @err contains 0 on success or the trap number when the
+ * operation raises an exception.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor
+ * states in addition to XSAVEC.
+ *
+ * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports
+ * compacted storage format in addition to XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
+ * address of the instruction where we might get an exception at.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_3(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVEC, X86_FEATURE_XSAVEC, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \
+ : \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
+extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
+#else
+static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
+#endif
+
+#ifdef CONFIG_X86_64
+static inline void xfd_update_state(struct fpstate *fpstate)
+{
+ if (fpu_state_size_dynamic()) {
+ u64 xfd = fpstate->xfd;
+
+ if (__this_cpu_read(xfd_state) != xfd) {
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+ }
+ }
+}
+
+extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
+#else
+static inline void xfd_update_state(struct fpstate *fpstate) { }
+
+static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
+ return -EPERM;
+}
+#endif
+
+/*
+ * Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
+ */
+static inline void os_xsave(struct fpstate *fpstate)
+{
+ u64 mask = fpstate->xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+ xfd_validate_state(fpstate, mask, false);
+
+ XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
+ */
+static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ xfd_validate_state(fpstate, mask, true);
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/* Restore of supervisor state. Does not require XFD */
+static inline void os_xrstor_supervisor(struct fpstate *fpstate)
+{
+ u64 mask = xfeatures_mask_supervisor();
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/*
+ * XSAVE itself always writes all requested xfeatures. Removing features
+ * from the request bitmap reduces the features which are written.
+ * Generate a mask of features which must be written to a sigframe. The
+ * unset features can be optimized away and not written.
+ *
+ * This optimization is user-visible. Only use for states where
+ * uninitialized sigframe contents are tolerable, like dynamic features.
+ *
+ * Users of buffers produced with this optimization must check XSTATE_BV
+ * to determine which features have been optimized out.
+ */
+static inline u64 xfeatures_need_sigframe_write(void)
+{
+ u64 xfeaures_to_write;
+
+ /* In-use features must be written: */
+ xfeaures_to_write = xfeatures_in_use();
+
+ /* Also write all non-optimizable sigframe features: */
+ xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED &
+ ~XFEATURE_MASK_SIGFRAME_INITOPT;
+
+ return xfeaures_to_write;
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for backward compatibility for
+ * old applications which don't understand the compacted format of the
+ * xsave area.
+ *
+ * The caller has to zero buf::header before calling this because XSAVE*
+ * does not touch the reserved fields in the header.
+ */
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
+{
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ struct fpstate *fpstate = current->thread.fpu.fpstate;
+ u64 mask = fpstate->user_xfeatures;
+ u32 lmask;
+ u32 hmask;
+ int err;
+
+ /* Optimize away writing unnecessary xfeatures: */
+ if (fpu_state_size_dynamic())
+ mask &= xfeatures_need_sigframe_write();
+
+ lmask = mask;
+ hmask = mask >> 32;
+ xfd_validate_state(fpstate, mask, false);
+
+ stac();
+ XSTATE_OP(XSAVE, buf, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(current->thread.fpu.fpstate, mask, true);
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
+{
+ struct xregs_state *xstate = &fpstate->regs.xsave;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpstate);
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ return err;
+}
+
+
+#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 7edbd5ee5ed4..5b4efc927d80 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -37,7 +37,7 @@
static int ftrace_poke_late = 0;
-int ftrace_arch_code_modify_prepare(void)
+void ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
/*
@@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void)
*/
mutex_lock(&text_mutex);
ftrace_poke_late = 1;
- return 0;
}
-int ftrace_arch_code_modify_post_process(void)
+void ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
/*
@@ -61,12 +60,11 @@ int ftrace_arch_code_modify_post_process(void)
text_poke_finish();
ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
- return 0;
}
static const char *ftrace_nop_replace(void)
{
- return ideal_nops[NOP_ATOMIC5];
+ return x86_nops[5];
}
static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
@@ -252,11 +250,6 @@ void arch_ftrace_update_code(int command)
ftrace_modify_all_code(command);
}
-int __init ftrace_dyn_arch_init(void)
-{
- return 0;
-}
-
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
@@ -308,7 +301,7 @@ union ftrace_op_code_union {
} __attribute__((packed));
};
-#define RET_SIZE 1
+#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
@@ -321,12 +314,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long offset;
unsigned long npages;
unsigned long size;
- unsigned long retq;
unsigned long *ptr;
void *trampoline;
void *ip;
/* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
union ftrace_op_code_union op_ptr;
int ret;
@@ -364,12 +357,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
goto fail;
ip = trampoline + size;
-
- /* The trampoline ends with ret(q) */
- retq = (unsigned long)ftrace_stub;
- ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
- if (WARN_ON(ret < 0))
- goto fail;
+ memcpy(ip, retq, RET_SIZE);
/* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
@@ -377,7 +365,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2);
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
if (ret < 0)
goto fail;
}
@@ -527,7 +515,7 @@ static void *addr_from_call(void *ptr)
return ptr + CALL_INSN_SIZE + call.disp;
}
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer);
/*
@@ -541,7 +529,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
void *ptr;
if (ops && ops->trampoline) {
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \
+ defined(CONFIG_FUNCTION_GRAPH_TRACER)
/*
* We only know about function graph tracer setting as static
* trampoline.
@@ -588,9 +577,8 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
+#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
extern void ftrace_graph_call(void);
-
static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
@@ -618,19 +606,17 @@ int ftrace_disable_ftrace_graph_caller(void)
return ftrace_mod_jmp(ip, &ftrace_stub);
}
-
-#endif /* !CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
/*
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer)
{
unsigned long return_hooker = (unsigned long)&return_to_handler;
- unsigned long old;
- int faulted;
+ int bit;
/*
* When resuming from suspend-to-ram, this function can be indirectly
@@ -650,37 +636,25 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
- /*
- * Protect against fault, even if it shouldn't
- * happen. This tool is too much intrusive to
- * ignore such a protection.
- */
- asm volatile(
- "1: " _ASM_MOV " (%[parent]), %[old]\n"
- "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
- " movl $0, %[faulted]\n"
- "3:\n"
-
- ".section .fixup, \"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 4b)
-
- : [old] "=&r" (old), [faulted] "=r" (faulted)
- : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
- : "memory"
- );
-
- if (unlikely(faulted)) {
- ftrace_graph_stop();
- WARN_ON(1);
+ bit = ftrace_test_recursion_trylock(ip, *parent);
+ if (bit < 0)
return;
- }
- if (function_graph_enter(old, self_addr, frame_pointer, parent))
- *parent = old;
+ if (!function_graph_enter(*parent, ip, frame_pointer, parent))
+ *parent = return_hooker;
+
+ ftrace_test_recursion_unlock(bit);
}
+
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = &fregs->regs;
+ unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+
+ prepare_ftrace_return(ip, (unsigned long *)stack, 0);
+}
+#endif
+
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index e405fe1a8bf4..a0ed0e4a2c0c 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -19,7 +19,7 @@
#endif
SYM_FUNC_START(__fentry__)
- ret
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -84,7 +84,7 @@ ftrace_graph_call:
/* This is weak to keep gas from relaxing the jumps */
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
- ret
+ RET
SYM_CODE_END(ftrace_caller)
SYM_CODE_START(ftrace_regs_caller)
@@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 7c273846c687..dfeb227de561 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -132,7 +132,7 @@
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
- retq
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -145,6 +145,7 @@ SYM_FUNC_START(ftrace_caller)
movq %rcx, RSP(%rsp)
SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
@@ -155,6 +156,7 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
movq $0, CS(%rsp)
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
call ftrace_stub
/* Handlers can change the RIP */
@@ -169,23 +171,20 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
* layout here.
*/
SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_caller);
+STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
SYM_FUNC_START(ftrace_epilogue)
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
- jmp ftrace_stub
-#endif
-
/*
* This is weak to keep gas from relaxing the jumps.
- * It is also used to copy the retq for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
UNWIND_HINT_FUNC
- retq
+ ENDBR
+ RET
SYM_FUNC_END(ftrace_epilogue)
SYM_FUNC_START(ftrace_regs_caller)
@@ -197,6 +196,7 @@ SYM_FUNC_START(ftrace_regs_caller)
/* save_mcount_regs fills in first two parameters */
SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
@@ -226,6 +226,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
leaq (%rsp), %rcx
SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
call ftrace_stub
/* Copy flags back to SS, to restore them */
@@ -251,9 +252,9 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
* If ORIG_RAX is anything but zero, make this a call to that.
* See arch_ftrace_set_direct_caller().
*/
- movq ORIG_RAX(%rsp), %rax
testq %rax, %rax
SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
jnz 1f
restore_mcount_regs
@@ -267,6 +268,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
* to the return.
*/
SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
jmp ftrace_epilogue
/* Swap the flags with orig_rax */
@@ -281,6 +283,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
+STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -289,17 +292,9 @@ SYM_FUNC_START(__fentry__)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
-fgraph_trace:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
- retq
+ ENDBR
+ RET
trace:
/* save_mcount_regs fills in first two parameters */
@@ -315,27 +310,18 @@ trace:
CALL_NOSPEC r8
restore_mcount_regs
- jmp fgraph_trace
+ jmp ftrace_stub
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
+STACK_FRAME_NON_STANDARD_FP(__fentry__)
+
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_FUNC_START(ftrace_graph_caller)
- /* Saves rbp into %rdx and fills first parameter */
- save_mcount_regs
-
- leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
- movq $0, %rdx /* No framepointers needed */
- call prepare_ftrace_return
-
- restore_mcount_regs
-
- retq
-SYM_FUNC_END(ftrace_graph_caller)
-
-SYM_FUNC_START(return_to_handler)
- subq $24, %rsp
+SYM_CODE_START(return_to_handler)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
+ subq $16, %rsp
/* Save the return values */
movq %rax, (%rsp)
@@ -347,7 +333,18 @@ SYM_FUNC_START(return_to_handler)
movq %rax, %rdi
movq 8(%rsp), %rdx
movq (%rsp), %rax
- addq $24, %rsp
- JMP_NOSPEC rdi
-SYM_FUNC_END(return_to_handler)
+
+ addq $16, %rsp
+ /*
+ * Jump back to the old return address. This cannot be JMP_NOSPEC rdi
+ * since IBT would demand that contain ENDBR, which simply isn't so for
+ * return addresses. Use a retpoline here to keep the RSB balanced.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call .Ldo_rop
+ int3
+.Ldo_rop:
+ mov %rdi, (%rsp)
+ RET
+SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5e9beb77cafd..bd4a34100ed0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/pgtable.h>
#include <asm/processor.h>
@@ -39,7 +39,8 @@
#include <asm/realmode.h>
#include <asm/extable.h>
#include <asm/trapnr.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
/*
* Manage page tables very early on.
@@ -104,7 +105,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
static bool __head check_la57_support(unsigned long physaddr)
{
/*
- * 5-level paging is detected and enabled at kernel decomression
+ * 5-level paging is detected and enabled at kernel decompression
* stage. Only check if it has been enabled there.
*/
if (!(native_read_cr4() & X86_CR4_LA57))
@@ -126,6 +127,49 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+{
+ unsigned long vaddr, vaddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use __pa() to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
+
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
@@ -135,7 +179,6 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -163,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr,
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
- /* Activate Secure Memory Encryption (SME) if supported and enabled */
- sme_enable(bp);
-
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
@@ -276,38 +316,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- */
- if (mem_encrypt_active()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
- i = pmd_index(vaddr);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
-}
-
-unsigned long __startup_secondary_64(void)
-{
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
+ return sme_postprocess_startup(bp, pmd);
}
/* Wipe all early page tables except for the kernel symbol map */
@@ -409,6 +418,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr)
trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
return;
+ if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs))
+ return;
+
early_fixup_exception(regs, trapnr);
}
@@ -480,6 +492,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
+ /*
+ * This needs to happen *before* kasan_early_init() because latter maps stuff
+ * into that page.
+ */
clear_page(init_top_pgt);
/*
@@ -491,8 +507,21 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
idt_setup_early_handler();
+ /* Needed before cc_platform_has() can be used for TDX */
+ tdx_early_init();
+
copy_bootdata(__va(real_mode_data));
/*
@@ -579,8 +608,10 @@ static void startup_64_load_idt(unsigned long physbase)
void early_setup_idt(void)
{
/* VMM Communication Exception */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ setup_ghcb();
set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+ }
bringup_idt_descr.address = (unsigned long)bringup_idt_table;
native_load_idt(&bringup_idt_descr);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7ed84c282233..eb8656bac99b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -318,8 +318,8 @@ SYM_FUNC_START(startup_32_smp)
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
- movl $(__KERNEL_STACK_CANARY),%eax
- movl %eax,%gs
+ xorl %eax,%eax
+ movl %eax,%gs # clear possible garbage in %gs
xorl %eax,%eax # Clear LDT
lldt %ax
@@ -339,22 +339,8 @@ SYM_FUNC_END(startup_32_smp)
*/
__INIT
setup_once:
-#ifdef CONFIG_STACKPROTECTOR
- /*
- * Configure the stack canary. The linker can't handle this by
- * relocation. Manually set base address in stack canary
- * segment descriptor.
- */
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-#endif
-
andl $0,setup_once_ref /* Once is enough, thanks */
- ret
+ RET
SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
@@ -446,7 +432,7 @@ SYM_FUNC_START(early_ignore_irq)
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
- call printk
+ call _printk
call dump_stack
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04bddaaba8e2..92c4afa2b729 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -62,13 +62,42 @@ SYM_CODE_START_NOALIGN(startup_64)
*/
/* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+ leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
+
+ /*
+ * initial_gs points to initial fixed_percpu_data struct with storage for
+ * the stack protector canary. Global pointer fixups are needed at this
+ * stage, so apply them as is done in fixup_pointer(), and initialize %gs
+ * such that the canary can be accessed at %gs:40 for subsequent C calls.
+ */
+ movl $MSR_GS_BASE, %ecx
+ movq initial_gs(%rip), %rax
+ movq $_text, %rdx
+ subq %rdx, %rax
+ addq %rdi, %rax
+ movq %rax, %rdx
+ shrq $32, %rdx
+ wrmsr
+
pushq %rsi
call startup_64_setup_env
popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Activate SEV/SME memory encryption if supported/enabled. This needs to
+ * be done now, since this also includes setup of the SEV-SNP CPUID table,
+ * which needs to be done before any CPUID instructions are executed in
+ * subsequent code.
+ */
+ movq %rsi, %rdi
+ pushq %rsi
+ call sme_enable
+ popq %rsi
+#endif
+
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
leaq .Lon_kernel_cs(%rip), %rax
@@ -99,6 +128,7 @@ SYM_CODE_END(startup_64)
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
@@ -127,21 +157,38 @@ SYM_CODE_START(secondary_startup_64)
*/
SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
- pushq %rsi
- call __startup_secondary_64
- popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ movq sme_me_mask, %rax
+#else
+ xorq %rax, %rax
+#endif
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(init_top_pgt - __START_KERNEL_map), %rax
1:
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ movq %cr4, %rcx
+ andl $X86_CR4_MCE, %ecx
+#else
+ movl $0, %ecx
+#endif
+
/* Enable PAE mode, PGE and LA57 */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
@@ -166,15 +213,33 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
call sev_verify_cbit
popq %rsi
- /* Switch to new page-table */
+ /*
+ * Switch to new page-table
+ *
+ * For the boot CPU this switches to early_top_pgt which still has the
+ * indentity mappings present. The secondary CPUs will switch to the
+ * init_top_pgt here, away from the trampoline_pgd and unmap the
+ * indentity mapped ranges.
+ */
movq %rax, %cr3
+ /*
+ * Do a global TLB flush after the CR3 switch to make sure the TLB
+ * entries from the identity mapping are flushed.
+ */
+ movq %cr4, %rcx
+ movq %rcx, %rax
+ xorq $X86_CR4_PGE, %rcx
+ movq %rcx, %cr4
+ movq %rax, %cr4
+
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
1:
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR // above
/*
* We must switch to a new descriptor in kernel space for the GDT
@@ -229,13 +294,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
+ */
+ movl %eax, %edx
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
@@ -282,6 +357,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
+ ANNOTATE_NOENDBR
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -311,6 +387,7 @@ SYM_CODE_END(start_cpu0)
*/
SYM_CODE_START_NOALIGN(vc_boot_ghcb)
UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
@@ -328,7 +405,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
/* Remove Error Code */
addq $8, %rsp
- /* Pure iret required here - don't use INTERRUPT_RETURN */
iretq
SYM_CODE_END(vc_boot_ghcb)
#endif
@@ -343,10 +419,10 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
* reliably detect the end of the stack.
*/
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA
__INIT
@@ -355,9 +431,11 @@ SYM_CODE_START(early_idt_handler_array)
.rept NUM_EXCEPTION_VECTORS
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
UNWIND_HINT_IRET_REGS
+ ENDBR
pushq $0 # Dummy error code, to make stack frame uniform
.else
UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
@@ -365,10 +443,11 @@ SYM_CODE_START(early_idt_handler_array)
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
- UNWIND_HINT_IRET_REGS offset=16
SYM_CODE_END(early_idt_handler_array)
+ ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS]
SYM_CODE_START_LOCAL(early_idt_handler_common)
+ UNWIND_HINT_IRET_REGS offset=16
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -409,11 +488,14 @@ SYM_CODE_END(early_idt_handler_common)
* early_idt_handler_array can't be used because it returns via the
* paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
*
+ * XXX it does, fix this.
+ *
* This handler will end up in the .init.text section and not be
* available to boot secondary CPUs.
*/
SYM_CODE_START_NOALIGN(vc_no_ghcb)
UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 08651a4e6aa0..71f336425e58 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/mwait.h>
#undef pr_fmt
#define pr_fmt(fmt) "hpet: " fmt
@@ -508,7 +509,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = {
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_write_msi_msg = hpet_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
};
static int hpet_msi_init(struct irq_domain *domain,
@@ -916,6 +917,83 @@ static bool __init hpet_counting(void)
return false;
}
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return false;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reasons why it is still in use on modern systems is the
+ * fact that it is impossible to reliably query TSC and CPU frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wreckaged by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * forseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
+}
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
@@ -929,6 +1007,9 @@ int __init hpet_enable(void)
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
if (!hpet_virt_address)
return 0;
@@ -1354,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_rtc_timer_reinit();
memset(&curr_time, 0, sizeof(struct rtc_time));
- if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- mc146818_get_time(&curr_time);
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
+ if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+ pr_err_ratelimited("unable to read current time from RTC\n");
+ return IRQ_HANDLED;
+ }
+ }
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 282b4ee1339f..15aefa3f3e18 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -235,15 +235,15 @@ static char irq_trigger[2];
*/
static void restore_ELCR(char *trigger)
{
- outb(trigger[0], 0x4d0);
- outb(trigger[1], 0x4d1);
+ outb(trigger[0], PIC_ELCR1);
+ outb(trigger[1], PIC_ELCR2);
}
static void save_ELCR(char *trigger)
{
/* IRQ 0,1,2,8,13 are marked as reserved */
- trigger[0] = inb(0x4d0) & 0xF8;
- trigger[1] = inb(0x4d1) & 0xDE;
+ trigger[0] = inb(PIC_ELCR1) & 0xF8;
+ trigger[1] = inb(PIC_ELCR2) & 0xDE;
}
static void i8259A_resume(void)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ee1a283f8e96..a58c6bc1cd68 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -10,6 +10,7 @@
#include <asm/proto.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
+#include <asm/idtentry.h>
#define DPL0 0x0
#define DPL3 0x3
@@ -35,12 +36,16 @@
#define SYSG(_vector, _addr) \
G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+#ifdef CONFIG_X86_64
/*
* Interrupt gate with interrupt stack. The _ist index is the index in
* the tss.ist[] array, but for the descriptor it needs to start at 1.
*/
#define ISTG(_vector, _addr, _ist) \
G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
/* Task gate */
#define TSKG(_vector, _gdt) \
@@ -64,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = {
*/
INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
};
/*
@@ -74,7 +82,7 @@ static const __initconst struct idt_data early_idts[] = {
*/
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, asm_exc_divide_error),
- INTG(X86_TRAP_NMI, asm_exc_nmi),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
INTG(X86_TRAP_BR, asm_exc_bounds),
INTG(X86_TRAP_UD, asm_exc_invalid_op),
INTG(X86_TRAP_NM, asm_exc_device_not_available),
@@ -91,12 +99,20 @@ static const __initconst struct idt_data def_idts[] = {
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
- INTG(X86_TRAP_DF, asm_exc_double_fault),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
#endif
- INTG(X86_TRAP_DB, asm_exc_debug),
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
#ifdef CONFIG_X86_MCE
- INTG(X86_TRAP_MC, asm_exc_machine_check),
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+ INTG(X86_TRAP_CP, asm_exc_control_protection),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
@@ -221,22 +237,6 @@ static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, asm_exc_page_fault),
};
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
-#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
-#endif
-};
-
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = {
* after that.
*
* Note, that X86_64 cannot install the real #PF handler in
- * idt_setup_early_traps() because the memory intialization needs the #PF
+ * idt_setup_early_traps() because the memory initialization needs the #PF
* handler from the early_idt_handler_array to initialize the early page
* tables.
*/
@@ -254,14 +254,6 @@ void __init idt_setup_early_pf(void)
idt_setup_from_table(idt_table, early_pf_idts,
ARRAY_SIZE(early_pf_idts), true);
}
-
-/**
- * idt_setup_ist_traps - Initialize the idt table with traps using IST
- */
-void __init idt_setup_ist_traps(void)
-{
- idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
-}
#endif
static void __init idt_map_in_cea(void)
@@ -288,7 +280,7 @@ void __init idt_setup_apic_and_irq_gates(void)
idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
- entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR);
set_intr_gate(i, entry);
}
@@ -299,7 +291,7 @@ void __init idt_setup_apic_and_irq_gates(void)
* system_vectors bitmap. Otherwise they show up in
* /proc/interrupts.
*/
- entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
+ entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
#endif
@@ -331,11 +323,10 @@ void __init idt_setup_early_handler(void)
/**
* idt_invalidate - Invalidate interrupt descriptor table
- * @addr: The virtual address of the 'invalid' IDT
*/
-void idt_invalidate(void *addr)
+void idt_invalidate(void)
{
- struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+ static const struct desc_ptr idt = { .address = 0, .size = 0 };
load_idt(&idt);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 58aa712973ac..766ffe3ba313 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
- else
+ else {
kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
@@ -338,7 +340,7 @@ void fixup_irqs(void)
irq_migrate_all_off_this_cpu();
/*
- * We can remove mdelay() and then send spuriuous interrupts to
+ * We can remove mdelay() and then send spurious interrupts to
* new cpu targets for all the irqs that were handled previously by
* this cpu. While it works, I have seen spurious interrupt messages
* (nothing wrong but still...).
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 044902d5a3c4..e5dd6da78713 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
struct irq_stack *irqstk;
@@ -148,6 +149,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
+#endif
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 8ef35063964b..aaf9e776f323 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -7,9 +7,11 @@
/*
* unsigned long native_save_fl(void)
*/
+.pushsection .noinstr.text, "ax"
SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
- ret
+ RET
SYM_FUNC_END(native_save_fl)
+.popsection
EXPORT_SYMBOL(native_save_fl)
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1afbdd1dd777..9ff480e94511 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu)
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
- smt_prio = prio * smp_num_siblings / i;
+ smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 5ba8477c2cb7..68f091ba8443 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -15,54 +15,76 @@
#include <asm/kprobes.h>
#include <asm/alternative.h>
#include <asm/text-patching.h>
+#include <asm/insn.h>
-static void bug_at(const void *ip, int line)
+int arch_jump_entry_size(struct jump_entry *entry)
{
- /*
- * The location is not an op that we were expecting.
- * Something went wrong. Crash the box, as something could be
- * corrupting the kernel.
- */
- pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
- BUG();
+ struct insn insn = {};
+
+ insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
+ BUG_ON(insn.length != 2 && insn.length != 5);
+
+ return insn.length;
}
-static const void *
-__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
+struct jump_label_patch {
+ const void *code;
+ int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type)
{
- const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
- const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- const void *expect, *code;
+ const void *expect, *code, *nop;
const void *addr, *dest;
- int line;
+ int size;
addr = (void *)jump_entry_code(entry);
dest = (void *)jump_entry_target(entry);
- code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ size = arch_jump_entry_size(entry);
+ switch (size) {
+ case JMP8_INSN_SIZE:
+ code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ case JMP32_INSN_SIZE:
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
- if (init) {
- expect = default_nop; line = __LINE__;
- } else if (type == JUMP_LABEL_JMP) {
- expect = ideal_nop; line = __LINE__;
- } else {
- expect = code; line = __LINE__;
+ default: BUG();
}
- if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
- bug_at(addr, line);
+ if (type == JUMP_LABEL_JMP)
+ expect = nop;
+ else
+ expect = code;
+
+ if (memcmp(addr, expect, size)) {
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n",
+ addr, addr, addr, expect, size, type);
+ BUG();
+ }
if (type == JUMP_LABEL_NOP)
- code = ideal_nop;
+ code = nop;
- return code;
+ return (struct jump_label_patch){.code = code, .size = size};
}
-static inline void __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static __always_inline void
+__jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
- const void *opcode = __jump_label_set_jump_code(entry, type, init);
+ const struct jump_label_patch jlp = __jump_label_patch(entry, type);
/*
* As long as only a single processor is running and the code is still
@@ -76,12 +98,11 @@ static inline void __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), opcode,
- JUMP_LABEL_NOP_SIZE);
+ text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@@ -102,7 +123,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- const void *opcode;
+ struct jump_label_patch jlp;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -113,9 +134,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
mutex_lock(&text_mutex);
- opcode = __jump_label_set_jump_code(entry, type, 0);
- text_poke_queue((void *)jump_entry_code(entry),
- opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ jlp = __jump_label_patch(entry, type);
+ text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
@@ -136,22 +156,6 @@ static enum {
__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- /*
- * This function is called at boot up and when modules are
- * first loaded. Check if the default nop, the one that is
- * inserted at compile time, is the ideal nop. If it is, then
- * we do not need to update the nop, and we can leave it as is.
- * If it is not, then we need to update the nop to the ideal nop.
- */
- if (jlstate == JL_STATE_START) {
- const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
- const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-
- if (memcmp(ideal_nop, default_nop, 5) != 0)
- jlstate = JL_STATE_UPDATE;
- else
- jlstate = JL_STATE_NO_UPDATE;
- }
if (jlstate == JL_STATE_UPDATE)
jump_label_transform(entry, type, 1);
}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 64b6da95af98..e2e89bebcbc3 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -88,11 +88,13 @@ create_setup_data_node(struct dentry *parent, int no,
static int __init create_setup_data_nodes(struct dentry *parent)
{
+ struct setup_indirect *indirect;
struct setup_data_node *node;
struct setup_data *data;
- int error;
+ u64 pa_data, pa_next;
struct dentry *d;
- u64 pa_data;
+ int error;
+ u32 len;
int no = 0;
d = debugfs_create_dir("setup_data", parent);
@@ -112,12 +114,29 @@ static int __init create_setup_data_nodes(struct dentry *parent)
error = -ENOMEM;
goto err_dir;
}
-
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
- node->paddr = ((struct setup_indirect *)data->data)->addr;
- node->type = ((struct setup_indirect *)data->data)->type;
- node->len = ((struct setup_indirect *)data->data)->len;
+ pa_next = data->next;
+
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
+ goto err_dir;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ node->paddr = indirect->addr;
+ node->type = indirect->type;
+ node->len = indirect->len;
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
} else {
node->paddr = pa_data;
node->type = data->type;
@@ -125,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
}
create_setup_data_node(d, no, node);
- pa_data = data->next;
+ pa_data = pa_next;
memunmap(data);
no++;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ce831f9448e7..170d0fd68b1f 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -75,7 +75,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
if (image->type == KEXEC_TYPE_CRASH) {
len = sprintf(cmdline_ptr,
- "elfcorehdr=0x%lx ", image->arch.elf_load_addr);
+ "elfcorehdr=0x%lx ", image->elf_load_addr);
}
memcpy(cmdline_ptr + len, cmdline, cmdline_len);
cmdline_len += len;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ff7878df96b4..3a43a2dee658 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -17,7 +17,7 @@
* Updated by: Tom Rini <trini@kernel.crashing.org>
* Updated by: Jason Wessel <jason.wessel@windriver.com>
* Modified for 386 by Jim Kingdon, Cygnus Support.
- * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * Original kgdb, compatibility with 2.1.xx kernel by
* David Grothe <dave@gcom.com>
* Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
* X86_64 changes from Andi Kleen's patch merged by Jim Houston
@@ -642,7 +642,7 @@ void kgdb_arch_late(void)
struct perf_event **pevent;
/*
- * Pre-allocate the hw breakpoint structions in the non-atomic
+ * Pre-allocate the hw breakpoint instructions in the non-atomic
* portion of kgdb because this operation requires mutexs to
* complete.
*/
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index 7d3a2e2daf01..c993521d4933 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -6,6 +6,7 @@
#include <asm/asm.h>
#include <asm/frame.h>
+#include <asm/insn.h>
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index df776cdca327..7c4ab8870da4 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
+#include <asm/ibt.h>
#include "common.h"
@@ -139,6 +140,8 @@ NOKPROBE_SYMBOL(synthesize_relcall);
int can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+ int i;
if (search_exception_tables((unsigned long)addr))
return 0; /* Page fault may occur on this address. */
@@ -151,35 +154,39 @@ int can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return 0;
- /* Can't boost Address-size override prefix */
- if (unlikely(inat_is_address_size_prefix(insn->attr)))
- return 0;
+ for_each_insn_prefix(insn, i, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+ /* Can't boost Address-size override prefix and CS override prefix */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return 0;
+ }
opcode = insn->opcode.bytes[0];
- switch (opcode & 0xf0) {
- case 0x60:
- /* can't boost "bound" */
- return (opcode != 0x62);
- case 0x70:
- return 0; /* can't boost conditional jump */
- case 0x90:
- return opcode != 0x9a; /* can't boost call far */
- case 0xc0:
- /* can't boost software-interruptions */
- return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
- case 0xd0:
- /* can boost AA* and XLAT */
- return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
- case 0xe0:
- /* can boost in/out and absolute jmps */
- return ((opcode & 0x04) || opcode == 0xea);
- case 0xf0:
- /* clear and set flags are boostable */
- return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+ switch (opcode) {
+ case 0x62: /* bound */
+ case 0x70 ... 0x7f: /* Conditional jumps */
+ case 0x9a: /* Call far */
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xcc ... 0xce: /* software exceptions */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ case 0xd6: /* (UD) */
+ case 0xd8 ... 0xdf: /* ESC */
+ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
+ case 0xe8 ... 0xe9: /* near Call, JMP */
+ case 0xeb: /* Short JMP */
+ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ case 0xf6 ... 0xf7: /* Grp3 */
+ case 0xfe: /* Grp4 */
+ /* ... are not boostable */
+ return 0;
+ case 0xff: /* Grp5 */
+ /* Only indirect jmp is boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
default:
- /* CS override prefix and call are not boostable */
- return (opcode != 0x2e && opcode != 0x9a);
+ return 1;
}
}
@@ -187,17 +194,10 @@ static unsigned long
__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
struct kprobe *kp;
- unsigned long faddr;
+ bool faddr;
kp = get_kprobe((void *)addr);
- faddr = ftrace_location(addr);
- /*
- * Addresses inside the ftrace location are refused by
- * arch_check_ftrace_location(). Something went terribly wrong
- * if such an address is checked here.
- */
- if (WARN_ON(faddr && faddr != addr))
- return 0UL;
+ faddr = ftrace_location(addr) == addr;
/*
* Use the current code if it is not modified by Kprobe
* and it cannot be modified by ftrace.
@@ -229,7 +229,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
return 0UL;
if (faddr)
- memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+ memcpy(buf, x86_nops[5], 5);
else
buf[0] = kp->opcode;
return (unsigned long)buf;
@@ -265,6 +265,8 @@ static int can_probe(unsigned long paddr)
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
+ int ret;
+
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
@@ -276,8 +278,10 @@ static int can_probe(unsigned long paddr)
__addr = recover_probed_instruction(buf, addr);
if (!__addr)
return 0;
- kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
- insn_get_length(&insn);
+
+ ret = insn_decode_kernel(&insn, (void *)__addr);
+ if (ret < 0)
+ return 0;
/*
* Another debugging subsystem might insert this breakpoint.
@@ -291,6 +295,22 @@ static int can_probe(unsigned long paddr)
return (addr == paddr);
}
+/* If x86 supports IBT (ENDBR) it must be skipped. */
+kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
+ bool *on_func_entry)
+{
+ if (is_endbr(*(u32 *)addr)) {
+ *on_func_entry = !offset || offset == 4;
+ if (*on_func_entry)
+ offset = 4;
+
+ } else {
+ *on_func_entry = !offset;
+ }
+
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
@@ -301,8 +321,8 @@ static int can_probe(unsigned long paddr)
int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
{
kprobe_opcode_t buf[MAX_INSN_SIZE];
- unsigned long recovered_insn =
- recover_probed_instruction(buf, (unsigned long)src);
+ unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
+ int ret;
if (!recovered_insn || !insn)
return 0;
@@ -312,8 +332,9 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
MAX_INSN_SIZE))
return 0;
- kernel_insn_init(insn, dest, MAX_INSN_SIZE);
- insn_get_length(insn);
+ ret = insn_decode_kernel(insn, dest);
+ if (ret < 0)
+ return 0;
/* We can not probe force emulate prefixed instruction */
if (insn_has_emulate_prefix(insn))
@@ -357,13 +378,14 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
return insn->length;
}
-/* Prepare reljump right after instruction to boost */
-static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
- struct insn *insn)
+/* Prepare reljump or int3 right after instruction */
+static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
{
int len = insn->length;
- if (can_boost(insn, p->addr) &&
+ if (!IS_ENABLED(CONFIG_PREEMPTION) &&
+ !p->post_handler && can_boost(insn, p->addr) &&
MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
/*
* These instructions can be executed directly if it
@@ -374,7 +396,12 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
len += JMP32_INSN_SIZE;
p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = 0;
+ /* Otherwise, put an int3 for trapping singlestep */
+ if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
+ return -ENOSPC;
+
+ buf[len] = INT3_INSN_OPCODE;
+ len += INT3_INSN_SIZE;
}
return len;
@@ -405,92 +432,290 @@ void *alloc_insn_page(void)
return page;
}
-/* Recover page to RW mode before releasing it */
-void free_insn_page(void *page)
+/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
+
+static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
{
- module_memfree(page);
+ switch (p->ainsn.opcode) {
+ case 0xfa: /* cli */
+ regs->flags &= ~(X86_EFLAGS_IF);
+ break;
+ case 0xfb: /* sti */
+ regs->flags |= X86_EFLAGS_IF;
+ break;
+ case 0x9c: /* pushf */
+ int3_emulate_push(regs, regs->flags);
+ break;
+ case 0x9d: /* popf */
+ regs->flags = int3_emulate_pop(regs);
+ break;
+ }
+ regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
+
+static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
+{
+ int3_emulate_ret(regs);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ret);
+
+static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ func += p->ainsn.rel32;
+ int3_emulate_call(regs, func);
}
+NOKPROBE_SYMBOL(kprobe_emulate_call);
-static void set_resume_flags(struct kprobe *p, struct insn *insn)
+static nokprobe_inline
+void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ if (cond)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
+{
+ __kprobe_emulate_jmp(p, regs, true);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp);
+
+static const unsigned long jcc_mask[6] = {
+ [0] = X86_EFLAGS_OF,
+ [1] = X86_EFLAGS_CF,
+ [2] = X86_EFLAGS_ZF,
+ [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+ [4] = X86_EFLAGS_SF,
+ [5] = X86_EFLAGS_PF,
+};
+
+static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
+{
+ bool invert = p->ainsn.jcc.type & 1;
+ bool match;
+
+ if (p->ainsn.jcc.type < 0xc) {
+ match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
+ } else {
+ match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+ if (p->ainsn.jcc.type >= 0xe)
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+ }
+ __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jcc);
+
+static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
+{
+ bool match;
+
+ if (p->ainsn.loop.type != 3) { /* LOOP* */
+ if (p->ainsn.loop.asize == 32)
+ match = ((*(u32 *)&regs->cx)--) != 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = ((*(u64 *)&regs->cx)--) != 0;
+#endif
+ else
+ match = ((*(u16 *)&regs->cx)--) != 0;
+ } else { /* JCXZ */
+ if (p->ainsn.loop.asize == 32)
+ match = *(u32 *)(&regs->cx) == 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = *(u64 *)(&regs->cx) == 0;
+#endif
+ else
+ match = *(u16 *)(&regs->cx) == 0;
+ }
+
+ if (p->ainsn.loop.type == 0) /* LOOPNE */
+ match = match && !(regs->flags & X86_EFLAGS_ZF);
+ else if (p->ainsn.loop.type == 1) /* LOOPE */
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+
+ __kprobe_emulate_jmp(p, regs, match);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_loop);
+
+static const int addrmode_regoffs[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+};
+
+static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_call(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
+
+static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
+
+static int prepare_emulation(struct kprobe *p, struct insn *insn)
{
insn_byte_t opcode = insn->opcode.bytes[0];
switch (opcode) {
case 0xfa: /* cli */
case 0xfb: /* sti */
+ case 0x9c: /* pushfl */
case 0x9d: /* popf/popfd */
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = 1;
- break;
- case 0x9c: /* pushfl */
- p->ainsn.is_pushf = 1;
+ /*
+ * IF modifiers must be emulated since it will enable interrupt while
+ * int3 single stepping.
+ */
+ p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
+ p->ainsn.opcode = opcode;
break;
- case 0xcf: /* iret */
- p->ainsn.if_modifier = 1;
- fallthrough;
case 0xc2: /* ret/lret */
case 0xc3:
case 0xca:
case 0xcb:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.is_abs_ip = 1;
- /* Without resume jump, this is boostable */
- p->ainsn.boostable = 1;
+ p->ainsn.emulate_op = kprobe_emulate_ret;
break;
- case 0xe8: /* call relative - Fix return addr */
- p->ainsn.is_call = 1;
+ case 0x9a: /* far call absolute -- segment is not supported */
+ case 0xea: /* far jmp absolute -- segment is not supported */
+ case 0xcc: /* int3 */
+ case 0xcf: /* iret -- in-kernel IRET is not supported */
+ return -EOPNOTSUPP;
break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- p->ainsn.is_call = 1;
- p->ainsn.is_abs_ip = 1;
+ case 0xe8: /* near call relative */
+ p->ainsn.emulate_op = kprobe_emulate_call;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
break;
-#endif
- case 0xff:
+ case 0xeb: /* short jump relative */
+ case 0xe9: /* near jump relative */
+ p->ainsn.emulate_op = kprobe_emulate_jmp;
+ if (insn->immediate.nbytes == 1)
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ else if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0x70 ... 0x7f:
+ /* 1 byte conditional jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ p->ainsn.rel32 = *(char *)insn->immediate.bytes;
+ break;
+ case 0x0f:
opcode = insn->opcode.bytes[1];
+ if ((opcode & 0xf0) == 0x80) {
+ /* 2 bytes Conditional Jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ } else if (opcode == 0x01 &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
+ X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
+ /* VM extensions - not supported */
+ return -EOPNOTSUPP;
+ }
+ break;
+ case 0xe0: /* Loop NZ */
+ case 0xe1: /* Loop */
+ case 0xe2: /* Loop */
+ case 0xe3: /* J*CXZ */
+ p->ainsn.emulate_op = kprobe_emulate_loop;
+ p->ainsn.loop.type = opcode & 0x3;
+ p->ainsn.loop.asize = insn->addr_bytes * 8;
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ break;
+ case 0xff:
+ /*
+ * Since the 0xff is an extended group opcode, the instruction
+ * is determined by the MOD/RM byte.
+ */
+ opcode = insn->modrm.bytes[0];
if ((opcode & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- p->ainsn.is_call = 1;
- p->ainsn.is_abs_ip = 1;
+ if ((opcode & 0x8) == 0x8)
+ return -EOPNOTSUPP; /* far call */
+ /* call absolute, indirect */
+ p->ainsn.emulate_op = kprobe_emulate_call_indirect;
+ } else if ((opcode & 0x30) == 0x20) {
+ if ((opcode & 0x8) == 0x8)
+ return -EOPNOTSUPP; /* far jmp */
+ /* jmp near absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
+ } else
break;
- } else if (((opcode & 0x31) == 0x20) ||
- ((opcode & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct.
- */
- p->ainsn.is_abs_ip = 1;
- /* Without resume jump, this is boostable */
- p->ainsn.boostable = 1;
- }
+
+ if (insn->addr_bytes != sizeof(unsigned long))
+ return -EOPNOTSUPP; /* Don't support different size */
+ if (X86_MODRM_MOD(opcode) != 3)
+ return -EOPNOTSUPP; /* TODO: support memory addressing */
+
+ p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
+#ifdef CONFIG_X86_64
+ if (X86_REX_B(insn->rex_prefix.value))
+ p->ainsn.indirect.reg += 8;
+#endif
+ break;
+ default:
break;
}
+ p->ainsn.size = insn->length;
+
+ return 0;
}
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- int len;
+ int ret, len;
/* Copy an instruction with recovering if other optprobe modifies it.*/
len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
if (!len)
return -EINVAL;
- /*
- * __copy_instruction can modify the displacement of the instruction,
- * but it doesn't affect boostable check.
- */
- len = prepare_boost(buf, p, &insn);
+ /* Analyze the opcode and setup emulate functions */
+ ret = prepare_emulation(p, &insn);
+ if (ret < 0)
+ return ret;
- /* Analyze the opcode and set resume flags */
- set_resume_flags(p, &insn);
+ /* Add int3 for single-step or booster jmp */
+ len = prepare_singlestep(buf, p, &insn);
+ if (len < 0)
+ return len;
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -583,42 +808,24 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
- = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- if (p->ainsn.if_modifier)
- kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
-}
-
-static nokprobe_inline void clear_btf(void)
-{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- }
+ = (regs->flags & X86_EFLAGS_IF);
}
-static nokprobe_inline void restore_btf(void)
+static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
}
-}
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- unsigned long *sara = stack_addr(regs);
-
- ri->ret_addr = (kprobe_opcode_t *) *sara;
- ri->fp = sara;
-
- /* Replace the return addr with trampoline addr */
- *sara = (unsigned long) &kretprobe_trampoline;
+ /* Restore back the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+NOKPROBE_SYMBOL(kprobe_post_process);
static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb, int reenter)
@@ -627,7 +834,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
return;
#if !defined(CONFIG_PREEMPTION)
- if (p->ainsn.boostable && !p->post_handler) {
+ if (p->ainsn.boostable) {
/* Boost up -- we can execute copied instructions directly */
if (!reenter)
reset_current_kprobe();
@@ -646,19 +853,51 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_status = KPROBE_REENTER;
} else
kcb->kprobe_status = KPROBE_HIT_SS;
- /* Prepare real single stepping */
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
+
+ if (p->ainsn.emulate_op) {
+ p->ainsn.emulate_op(p, regs);
+ kprobe_post_process(p, regs, kcb);
+ return;
+ }
+
+ /* Disable interrupt, and set ip register on trampoline */
regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == INT3_INSN_OPCODE)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
+ regs->ip = (unsigned long)p->ainsn.insn;
}
NOKPROBE_SYMBOL(setup_singlestep);
/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again
+ * right after the copied instruction.
+ * Different from the trap single-step, "int3" single-step can not
+ * handle the instruction which changes the ip register, e.g. jmp,
+ * call, conditional jmp, and the instructions which changes the IF
+ * flags because interrupt must be disabled around the single-stepping.
+ * Such instructions are software emulated, but others are single-stepped
+ * using "int3".
+ *
+ * When the 2nd "int3" handled, the regs->ip and regs->flags needs to
+ * be adjusted, so that we can resume execution on correct code.
+ */
+static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+
+ /* Restore saved interrupt flag and ip register */
+ regs->flags |= kcb->kprobe_saved_flags;
+ /* Note that regs->ip is executed int3 so must be a step back */
+ regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
+}
+NOKPROBE_SYMBOL(resume_singlestep);
+
+/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
@@ -693,6 +932,12 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(reenter_kprobe);
+static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
+{
+ return (kcb->kprobe_status == KPROBE_HIT_SS ||
+ kcb->kprobe_status == KPROBE_REENTER);
+}
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
@@ -737,7 +982,18 @@ int kprobe_int3_handler(struct pt_regs *regs)
reset_current_kprobe();
return 1;
}
- } else if (*addr != INT3_INSN_OPCODE) {
+ } else if (kprobe_is_ss(kcb)) {
+ p = kprobe_running();
+ if ((unsigned long)p->ainsn.insn < regs->ip &&
+ (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
+ /* Most provably this is the second int3 for singlestep */
+ resume_singlestep(p, regs, kcb);
+ kprobe_post_process(p, regs, kcb);
+ return 1;
+ }
+ }
+
+ if (*addr != INT3_INSN_OPCODE) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
@@ -755,146 +1011,6 @@ int kprobe_int3_handler(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kprobe_int3_handler);
-/*
- * When a retprobed function returns, this code saves registers and
- * calls trampoline_handler() runs, which calls the kretprobe's handler.
- */
-asm(
- ".text\n"
- ".global kretprobe_trampoline\n"
- ".type kretprobe_trampoline, @function\n"
- "kretprobe_trampoline:\n"
- /* We don't bother saving the ss register */
-#ifdef CONFIG_X86_64
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rdi\n"
- " call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 19*8(%rsp)\n"
- RESTORE_REGS_STRING
- " popfq\n"
-#else
- " pushl %esp\n"
- " pushfl\n"
- SAVE_REGS_STRING
- " movl %esp, %eax\n"
- " call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movl %eax, 15*4(%esp)\n"
- RESTORE_REGS_STRING
- " popfl\n"
-#endif
- " ret\n"
- ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
-);
-NOKPROBE_SYMBOL(kretprobe_trampoline);
-STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
-
-
-/*
- * Called from kretprobe_trampoline
- */
-__used __visible void *trampoline_handler(struct pt_regs *regs)
-{
- /* fixup registers */
- regs->cs = __KERNEL_CS;
-#ifdef CONFIG_X86_32
- regs->gs = 0;
-#endif
- regs->ip = (unsigned long)&kretprobe_trampoline;
- regs->orig_ax = ~0UL;
-
- return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, &regs->sp);
-}
-NOKPROBE_SYMBOL(trampoline_handler);
-
-/*
- * Called after single-stepping. p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction. To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction. The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt. We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new ip is relative to the copied instruction. We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed flags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- */
-static void resume_execution(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- unsigned long *tos = stack_addr(regs);
- unsigned long copy_ip = (unsigned long)p->ainsn.insn;
- unsigned long orig_ip = (unsigned long)p->addr;
-
- regs->flags &= ~X86_EFLAGS_TF;
-
- /* Fixup the contents of top of stack */
- if (p->ainsn.is_pushf) {
- *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
- *tos |= kcb->kprobe_old_flags;
- } else if (p->ainsn.is_call) {
- *tos = orig_ip + (*tos - copy_ip);
- }
-
- if (!p->ainsn.is_abs_ip)
- regs->ip += orig_ip - copy_ip;
-
- restore_btf();
-}
-NOKPROBE_SYMBOL(resume_execution);
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled throughout this function.
- */
-int kprobe_debug_handler(struct pt_regs *regs)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- if (!cur)
- return 0;
-
- resume_execution(cur, regs, kcb);
- regs->flags |= kcb->kprobe_saved_flags;
-
- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- cur->post_handler(cur, regs, 0);
- }
-
- /* Restore back the original saved kprobes variables and continue. */
- if (kcb->kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe(kcb);
- goto out;
- }
- reset_current_kprobe();
-out:
- /*
- * if somebody else is singlestepping across a probe point, flags
- * will have TF set, in which case, continue the remaining processing
- * of do_debug, as if this is not a probe hit.
- */
- if (regs->flags & X86_EFLAGS_TF)
- return 0;
-
- return 1;
-}
-NOKPROBE_SYMBOL(kprobe_debug_handler);
-
int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
@@ -912,20 +1028,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* normal page fault.
*/
regs->ip = (unsigned long)cur->addr;
- /*
- * Trap flag (TF) has been set here because this fault
- * happened where the single stepping will be done.
- * So clear it by resetting the current kprobe:
- */
- regs->flags &= ~X86_EFLAGS_TF;
- /*
- * Since the single step (trap) has been cancelled,
- * we need to restore BTF here.
- */
- restore_btf();
/*
- * If the TF flag was set before the kprobe hit,
+ * If the IF flag was set before the kprobe hit,
* don't touch it:
*/
regs->flags |= kcb->kprobe_old_flags;
@@ -934,24 +1039,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
- } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
- kcb->kprobe_status == KPROBE_HIT_SSDONE) {
- /*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(cur);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
- return 1;
}
return 0;
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 51c7f5271aee..dd2ec14adb77 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -12,7 +12,7 @@
#include "common.h"
-/* Ftrace callback handler for kprobes -- called under preepmt disabled */
+/* Ftrace callback handler for kprobes -- called under preempt disabled */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
goto out;
@@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
__this_cpu_write(current_kprobe, NULL);
}
out:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 08eb23074f92..e6b8c5362b94 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -106,7 +106,8 @@ asm (
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save the 'sp - 8', this will be fixed later. */
" pushq %rsp\n"
" pushfq\n"
".global optprobe_template_clac\n"
@@ -121,14 +122,17 @@ asm (
".global optprobe_template_call\n"
"optprobe_template_call:\n"
ASM_NOP5
- /* Move flags to rsp */
+ /* Copy 'regs->flags' into 'regs->ss'. */
" movq 18*8(%rsp), %rdx\n"
- " movq %rdx, 19*8(%rsp)\n"
+ " movq %rdx, 20*8(%rsp)\n"
RESTORE_REGS_STRING
- /* Skip flags entry */
- " addq $8, %rsp\n"
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addq $16, %rsp\n"
+ /* And pop flags register from 'regs->ss'. */
" popfq\n"
#else /* CONFIG_X86_32 */
+ " pushl %ss\n"
+ /* Save the 'sp - 4', this will be fixed later. */
" pushl %esp\n"
" pushfl\n"
".global optprobe_template_clac\n"
@@ -142,12 +146,13 @@ asm (
".global optprobe_template_call\n"
"optprobe_template_call:\n"
ASM_NOP5
- /* Move flags into esp */
+ /* Copy 'regs->flags' into 'regs->ss'. */
" movl 14*4(%esp), %edx\n"
- " movl %edx, 15*4(%esp)\n"
+ " movl %edx, 16*4(%esp)\n"
RESTORE_REGS_STRING
- /* Skip flags entry */
- " addl $4, %esp\n"
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addl $8, %esp\n"
+ /* And pop flags register from 'regs->ss'. */
" popfl\n"
#endif
".global optprobe_template_end\n"
@@ -179,6 +184,8 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
kprobes_inc_nmissed_count(&op->kp);
} else {
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Adjust stack pointer */
+ regs->sp += sizeof(long);
/* Save skipped registers */
regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
@@ -312,6 +319,8 @@ static int can_optimize(unsigned long paddr)
addr = paddr - offset;
while (addr < paddr - offset + size) { /* Decode until function end */
unsigned long recovered_insn;
+ int ret;
+
if (search_exception_tables(addr))
/*
* Since some fixup code will jumps into this function,
@@ -321,8 +330,11 @@ static int can_optimize(unsigned long paddr)
recovered_insn = recover_probed_instruction(buf, addr);
if (!recovered_insn)
return 0;
- kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
- insn_get_length(&insn);
+
+ ret = insn_decode_kernel(&insn, (void *)recovered_insn);
+ if (ret < 0)
+ return 0;
+
/*
* In the case of detecting unknown breakpoint, this could be
* a padding INT3 between functions. Let's check that all the
@@ -362,10 +374,10 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
/* Check the addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
+ kprobe_opcode_t *addr)
{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
+ return (op->kp.addr <= addr &&
+ op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index d0a19121c6a4..257892fcefa7 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -91,26 +91,41 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
static int __init get_setup_data_size(int nr, size_t *size)
{
- int i = 0;
+ u64 pa_data = boot_params.hdr.setup_data, pa_next;
+ struct setup_indirect *indirect;
struct setup_data *data;
- u64 pa_data = boot_params.hdr.setup_data;
+ int i = 0;
+ u32 len;
while (pa_data) {
data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
+ pa_next = data->next;
+
if (nr == i) {
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
- *size = ((struct setup_indirect *)data->data)->len;
- else
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT)
+ *size = indirect->len;
+ else
+ *size = data->len;
+ } else {
*size = data->len;
+ }
memunmap(data);
return 0;
}
- pa_data = data->next;
+ pa_data = pa_next;
memunmap(data);
i++;
}
@@ -120,9 +135,11 @@ static int __init get_setup_data_size(int nr, size_t *size)
static ssize_t type_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
int nr, ret;
u64 paddr;
- struct setup_data *data;
+ u32 len;
ret = kobj_to_setup_data_nr(kobj, &nr);
if (ret)
@@ -135,10 +152,20 @@ static ssize_t type_show(struct kobject *kobj,
if (!data)
return -ENOMEM;
- if (data->type == SETUP_INDIRECT)
- ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type);
- else
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ ret = sprintf(buf, "0x%x\n", indirect->type);
+ } else {
ret = sprintf(buf, "0x%x\n", data->type);
+ }
+
memunmap(data);
return ret;
}
@@ -149,9 +176,10 @@ static ssize_t setup_data_data_read(struct file *fp,
char *buf,
loff_t off, size_t count)
{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
int nr, ret = 0;
u64 paddr, len;
- struct setup_data *data;
void *p;
ret = kobj_to_setup_data_nr(kobj, &nr);
@@ -165,10 +193,27 @@ static ssize_t setup_data_data_read(struct file *fp,
if (!data)
return -ENOMEM;
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
- paddr = ((struct setup_indirect *)data->data)->addr;
- len = ((struct setup_indirect *)data->data)->len;
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ } else {
+ /*
+ * Even though this is technically undefined, return
+ * the data as though it is a normal setup_data struct.
+ * This will at least allow it to be inspected.
+ */
+ paddr += sizeof(*data);
+ len = data->len;
+ }
} else {
paddr += sizeof(*data);
len = data->len;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..1a3658f7e6d9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -26,6 +26,9 @@
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
+#include <linux/syscore_ops.h>
+#include <linux/cc_platform.h>
+#include <linux/efi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -37,7 +40,9 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
+#include <asm/reboot.h>
#include <asm/svm.h>
+#include <asm/e820/api.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -64,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -185,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
- struct kvm_task_sleep_node *n;
+ struct kvm_task_sleep_node *n, *dummy = NULL;
if (token == ~0) {
apf_task_wake_all();
@@ -197,28 +203,41 @@ again:
n = _find_apf_task(b, token);
if (!n) {
/*
- * async PF was not yet handled.
- * Add dummy entry for the token.
+ * Async #PF not yet handled, add a dummy entry for the token.
+ * Allocating the token must be down outside of the raw lock
+ * as the allocator is preemptible on PREEMPT_RT kernels.
*/
- n = kzalloc(sizeof(*n), GFP_ATOMIC);
- if (!n) {
+ if (!dummy) {
+ raw_spin_unlock(&b->lock);
+ dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
+
/*
- * Allocation failed! Busy wait while other cpu
- * handles async PF.
+ * Continue looping on allocation failure, eventually
+ * the async #PF will be handled and allocating a new
+ * node will be unnecessary.
+ */
+ if (!dummy)
+ cpu_relax();
+
+ /*
+ * Recheck for async #PF completion before enqueueing
+ * the dummy token to avoid duplicate list entries.
*/
- raw_spin_unlock(&b->lock);
- cpu_relax();
goto again;
}
- n->token = token;
- n->cpu = smp_processor_id();
- init_swait_queue_head(&n->wq);
- hlist_add_head(&n->link, &b->list);
+ dummy->token = token;
+ dummy->cpu = smp_processor_id();
+ init_swait_queue_head(&dummy->wq);
+ hlist_add_head(&dummy->link, &b->list);
+ dummy = NULL;
} else {
apf_task_wake_one(n);
}
raw_spin_unlock(&b->lock);
- return;
+
+ /* A dummy token might be allocated and ultimately not used. */
+ if (dummy)
+ kfree(dummy);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
@@ -308,7 +327,7 @@ static void kvm_register_steal_time(void)
return;
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
- pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+ pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
(unsigned long long) slow_virt_to_phys(st));
}
@@ -345,7 +364,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
+ pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@@ -371,34 +390,17 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
+ pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}
-static void kvm_pv_guest_cpu_reboot(void *unused)
+static void kvm_disable_steal_time(void)
{
- /*
- * We disable PV EOI before we load a new kernel by kexec,
- * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
- * New kernel can re-enable when it boots.
- */
- if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- wrmsrl(MSR_KVM_PV_EOI_EN, 0);
- kvm_pv_disable_apf();
- kvm_disable_steal_time();
-}
+ if (!has_steal_clock)
+ return;
-static int kvm_pv_reboot_notify(struct notifier_block *nb,
- unsigned long code, void *unused)
-{
- if (code == SYS_RESTART)
- on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
- return NOTIFY_DONE;
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
-static struct notifier_block kvm_pv_reboot_nb = {
- .notifier_call = kvm_pv_reboot_notify,
-};
-
static u64 kvm_steal_clock(int cpu)
{
u64 steal;
@@ -416,14 +418,6 @@ static u64 kvm_steal_clock(int cpu)
return steal;
}
-void kvm_disable_steal_time(void)
-{
- if (!has_steal_clock)
- return;
-
- wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
-}
-
static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
early_set_memory_decrypted((unsigned long) ptr, size);
@@ -441,7 +435,7 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
@@ -451,27 +445,55 @@ static void __init sev_map_percpu_data(void)
}
}
-static bool pv_tlb_flush_supported(void)
+static void kvm_guest_cpu_offline(bool shutdown)
{
- return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
+ kvm_pv_disable_apf();
+ if (!shutdown)
+ apf_task_wake_all();
+ kvmclock_disable();
}
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+static int kvm_cpu_online(unsigned int cpu)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_init();
+ local_irq_restore(flags);
+ return 0;
+}
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
+static bool pv_tlb_flush_supported(void)
+{
+ return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
+}
+
static bool pv_ipi_supported(void)
{
- return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
+ (num_possible_cpus() != 1));
}
static bool pv_sched_yield_supported(void)
{
return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
}
#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
@@ -509,7 +531,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
ipi_bitmap <<= min - apic_id;
min = apic_id;
- } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
@@ -549,6 +571,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
__send_ipi_mask(local_mask, vector);
}
+static int __init setup_efi_kvm_sev_migration(void)
+{
+ efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
+ efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
+ efi_status_t status;
+ unsigned long size;
+ bool enabled;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
+ !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ return 0;
+
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+ pr_info("%s : EFI runtime services are not enabled\n", __func__);
+ return 0;
+ }
+
+ size = sizeof(enabled);
+
+ /* Get variable contents into buffer */
+ status = efi.get_variable(efi_sev_live_migration_enabled,
+ &efi_variable_guid, NULL, &size, &enabled);
+
+ if (status == EFI_NOT_FOUND) {
+ pr_info("%s : EFI live migration variable not found\n", __func__);
+ return 0;
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_info("%s : EFI variable retrieval failed\n", __func__);
+ return 0;
+ }
+
+ if (enabled == 0) {
+ pr_info("%s: live migration disabled in EFI\n", __func__);
+ return 0;
+ }
+
+ pr_info("%s : live migration enabled in EFI\n", __func__);
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
+
+ return 1;
+}
+
+late_initcall(setup_efi_kvm_sev_migration);
+
/*
* Set the IPI entry points
*/
@@ -567,13 +638,61 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
/* Make sure other vCPUs get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
- if (vcpu_is_preempted(cpu)) {
+ if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
break;
}
}
}
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ /*
+ * The local vCPU is never preempted, so we do not explicitly
+ * skip check for local vCPU - it will never be cleared from
+ * flushmask.
+ */
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_multi(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -587,57 +706,113 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init();
}
-static void kvm_guest_cpu_offline(void)
+static int kvm_cpu_down_prepare(unsigned int cpu)
{
- kvm_disable_steal_time();
- if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- wrmsrl(MSR_KVM_PV_EOI_EN, 0);
- kvm_pv_disable_apf();
- apf_task_wake_all();
-}
+ unsigned long flags;
-static int kvm_cpu_online(unsigned int cpu)
-{
- local_irq_disable();
- kvm_guest_cpu_init();
- local_irq_enable();
+ local_irq_save(flags);
+ kvm_guest_cpu_offline(false);
+ local_irq_restore(flags);
return 0;
}
-static int kvm_cpu_down_prepare(unsigned int cpu)
+#endif
+
+static int kvm_suspend(void)
{
- local_irq_disable();
- kvm_guest_cpu_offline();
- local_irq_enable();
+ u64 val = 0;
+
+ kvm_guest_cpu_offline(false);
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrl(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
return 0;
}
+
+static void kvm_resume(void)
+{
+ kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrl(MSR_KVM_POLL_CONTROL, 0);
#endif
+}
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
+static struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
{
- u8 state;
- int cpu;
- struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+ kvm_guest_cpu_offline(true);
+}
- cpumask_copy(flushmask, cpumask);
- /*
- * We have to call flush only on online vCPUs. And
- * queue flush_on_enter for pre-empted vCPUs
- */
- for_each_cpu(cpu, flushmask) {
- src = &per_cpu(steal_time, cpu);
- state = READ_ONCE(src->preempted);
- if ((state & KVM_VCPU_PREEMPTED)) {
- if (try_cmpxchg(&src->preempted, &state,
- state | KVM_VCPU_FLUSH_TLB))
- __cpumask_clear_cpu(cpu, flushmask);
- }
- }
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+/*
+ * After a PV feature is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written.
+ */
+#ifdef CONFIG_KEXEC_CORE
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ kvm_guest_cpu_offline(true);
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
- native_flush_tlb_others(flushmask, info);
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+ASM_ENDBR
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+ASM_RET
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
static void __init kvm_guest_init(void)
{
@@ -650,13 +825,10 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
- pv_ops.time.steal_clock = kvm_steal_clock;
- }
+ static_call_update(pv_steal_clock, kvm_steal_clock);
- if (pv_tlb_flush_supported()) {
- pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
- pr_info("KVM setup pv remote TLB flush\n");
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -668,6 +840,12 @@ static void __init kvm_guest_init(void)
}
#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -681,6 +859,12 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+
+ register_syscore_ops(&kvm_syscore_ops);
+
/*
* Hard lockup detection is enabled by default. Disable it, as guests
* can get false positives too easily, for example if the host is
@@ -695,7 +879,7 @@ static noinline uint32_t __kvm_cpuid_base(void)
return 0; /* So we don't blow up on old processors */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+ return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
return 0;
}
@@ -734,7 +918,7 @@ static uint32_t __init kvm_detect(void)
static void __init kvm_apic_init(void)
{
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
@@ -745,8 +929,62 @@ static bool __init kvm_msi_ext_dest_id(void)
return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}
+static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
+{
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
+ KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+}
+
static void __init kvm_init_platform(void)
{
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
+ unsigned long nr_pages;
+ int i;
+
+ pv_ops.mmu.notify_page_enc_status_changed =
+ kvm_sev_hc_page_enc_status;
+
+ /*
+ * Reset the host's shared pages list related to kernel
+ * specific page encryption status settings before we load a
+ * new kernel by kexec. Reset the page encryption status
+ * during early boot intead of just before kexec to avoid SMP
+ * races during kvm_pv_guest_cpu_reboot().
+ * NOTE: We cannot reset the complete shared pages list
+ * here as we need to retain the UEFI/OVMF firmware
+ * specific settings.
+ */
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
+ nr_pages,
+ KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+ }
+
+ /*
+ * Ensure that _bss_decrypted section is marked as decrypted in the
+ * shared pages list.
+ */
+ nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted,
+ PAGE_SIZE);
+ early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
+ nr_pages, 0);
+
+ /*
+ * If not booted using EFI, enable Live migration support.
+ */
+ if (!efi_enabled(EFI_BOOT))
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL,
+ KVM_MIGRATION_READY);
+ }
kvmclock_init();
x86_platform.apic_post_init = kvm_apic_init;
}
@@ -794,32 +1032,6 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
-static __init int kvm_alloc_cpumask(void)
-{
- int cpu;
- bool alloc = false;
-
- if (!kvm_para_available() || nopv)
- return 0;
-
- if (pv_tlb_flush_supported())
- alloc = true;
-
-#if defined(CONFIG_SMP)
- if (pv_ipi_supported())
- alloc = true;
-#endif
-
- if (alloc)
- for_each_possible_cpu(cpu) {
- zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
- GFP_KERNEL, cpu_to_node(cpu));
- }
-
- return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
@@ -850,46 +1062,14 @@ static void kvm_wait(u8 *ptr, u8 val)
} else {
local_irq_disable();
+ /* safe_halt() will enable IRQ */
if (READ_ONCE(*ptr) == val)
safe_halt();
-
- local_irq_enable();
+ else
+ local_irq_enable();
}
}
-#ifdef CONFIG_X86_32
-__visible bool __kvm_vcpu_is_preempted(long cpu)
-{
- struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-
- return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-
-#else
-
-#include <asm/asm-offsets.h>
-
-extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
-
-/*
- * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
- * restoring to/from the stack.
- */
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-"ret;"
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
-
-#endif
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -933,10 +1113,6 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.wait = kvm_wait;
pv_ops.lock.kick = kvm_kick_cpu;
- if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_ops.lock.vcpu_is_preempted =
- PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
- }
/*
* When PV spinlock is enabled which is preferred over
* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1fc0962c89c0..16333ba1904b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,11 +16,10 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
+#include <linux/cc_platform.h>
#include <asm/hypervisor.h>
-#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
-#include <asm/reboot.h>
#include <asm/kvmclock.h>
static int kvmclock __initdata = 1;
@@ -50,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
static struct pvclock_wall_clock wall_clock __bss_decrypted;
-static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
static struct pvclock_vsyscall_time_info *hvclock_mem;
-
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
-{
- return &this_cpu_read(hv_clock_per_cpu)->pvti;
-}
-
-static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
-{
- return this_cpu_read(hv_clock_per_cpu);
-}
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
/*
* The wallclock is the time of day when we booted. Since then, some time may
@@ -106,7 +96,7 @@ static inline void kvm_sched_clock_init(bool stable)
if (!stable)
clear_sched_clock_stable();
kvm_sched_clock_offset = kvm_clock_read();
- pv_ops.time.sched_clock = kvm_sched_clock_read;
+ paravirt_set_sched_clock(kvm_sched_clock_read);
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
@@ -184,7 +174,7 @@ static void kvm_register_clock(char *txt)
pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
wrmsrl(msr_kvm_system_time, pa);
- pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
static void kvm_save_sched_clock_state(void)
@@ -203,28 +193,9 @@ static void kvm_setup_secondary_clock(void)
}
#endif
-/*
- * After the clock is registered, the host will keep writing to the
- * registered memory location. If the guest happens to shutdown, this memory
- * won't be valid. In cases like kexec, in which you install a new kernel, this
- * means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writing anything
- * that does not have the 'enable' bit set in the msr
- */
-#ifdef CONFIG_KEXEC_CORE
-static void kvm_crash_shutdown(struct pt_regs *regs)
-{
- native_write_msr(msr_kvm_system_time, 0, 0);
- kvm_disable_steal_time();
- native_machine_crash_shutdown(regs);
-}
-#endif
-
-static void kvm_shutdown(void)
+void kvmclock_disable(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
- kvm_disable_steal_time();
- native_machine_shutdown();
}
static void __init kvmclock_init_mem(void)
@@ -252,7 +223,7 @@ static void __init kvmclock_init_mem(void)
* hvclock is shared between the guest and the hypervisor, must
* be mapped decrypted.
*/
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
r = set_memory_decrypted((unsigned long) hvclock_mem,
1UL << order);
if (r) {
@@ -268,6 +239,9 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
+ if (!kvm_para_available() || !kvmclock || nopv)
+ return 0;
+
kvmclock_init_mem();
#ifdef CONFIG_X86_64
@@ -351,10 +325,6 @@ void __init kvmclock_init(void)
#endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
- machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC_CORE
- machine_ops.crash_shutdown = kvm_crash_shutdown;
-#endif
kvm_get_preset_lpj();
/*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index aa15132228da..525876e7b9f4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
if (num_entries > LDT_ENTRIES)
return NULL;
- new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
if (!new_ldt)
return NULL;
@@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
* than PAGE_SIZE.
*/
if (alloc_size > PAGE_SIZE)
- new_ldt->entries = vzalloc(alloc_size);
+ new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
else
- new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!new_ldt->entries) {
kfree(new_ldt);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 64b00b0d7fe8..1b373d79cedc 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,17 +23,6 @@
#include <asm/set_memory.h>
#include <asm/debugreg.h>
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
static void load_segments(void)
{
#define __STR(X) #X
@@ -232,8 +221,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- idt_invalidate(phys_to_virt(0));
- set_gdt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a29a44a98e5b..0611fd83858e 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
+#include <linux/cc_platform.h>
#include <asm/init.h>
#include <asm/tlbflush.h>
@@ -26,6 +27,7 @@
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>
+#include <asm/cpu.h>
#ifdef CONFIG_ACPI
/*
@@ -166,7 +168,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
}
pte = pte_offset_kernel(pmd, vaddr);
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
prot = PAGE_KERNEL_EXEC;
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
@@ -206,7 +208,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
info.kernpg_flag |= _PAGE_ENC;
}
@@ -256,35 +258,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
return init_transition_pgtable(image, level4p);
}
-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
-
- /* x86-64 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
-
static void load_segments(void)
{
__asm__ __volatile__ (
@@ -338,6 +311,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
hw_breakpoint_disable();
+ cet_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
@@ -353,7 +327,7 @@ void machine_kexec(struct kimage *image)
}
control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
@@ -379,15 +353,15 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel((unsigned long)image->head,
(unsigned long)page_list,
image->start,
image->preserve_context,
- sme_active());
+ cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -402,9 +376,6 @@ void machine_kexec(struct kimage *image)
#ifdef CONFIG_KEXEC_FILE
void *arch_kexec_kernel_image_load(struct kimage *image)
{
- vfree(image->arch.elf_headers);
- image->arch.elf_headers = NULL;
-
if (!image->fops || !image->fops->load)
return ERR_PTR(-ENOEXEC);
@@ -540,6 +511,15 @@ overflow:
(int)ELF64_R_TYPE(rel[i].r_info), value);
return -ENOEXEC;
}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ vfree(image->elf_headers);
+ image->elf_headers = NULL;
+ image->elf_headers_sz = 0;
+
+ return kexec_image_post_load_cleanup_default(image);
+}
#endif /* CONFIG_KEXEC_FILE */
static int
@@ -598,12 +578,12 @@ void arch_kexec_unprotect_crashkres(void)
*/
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return 0;
/*
- * If SME is active we need to be sure that kexec pages are
- * not encrypted because when we boot to the new kernel the
+ * If host memory encryption is active we need to be sure that kexec
+ * pages are not encrypted because when we boot to the new kernel the
* pages won't be accessed encrypted (initially).
*/
return set_memory_decrypted((unsigned long)vaddr, pages);
@@ -611,12 +591,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return;
/*
- * If SME is active we need to reset the pages back to being
- * an encrypted mapping before freeing them.
+ * If host memory encryption is active we need to reset the pages back
+ * to being an encrypted mapping before freeing them.
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index b5cb49e57df8..c94dec6a1834 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void)
return;
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
+ address = MSR_AMD64_SYSCFG;
rdmsrl(address, val);
/* TOP_MEM2 is not enabled? */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5e9a34b5bd74..b98ffcf4d250 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
@@ -74,10 +75,10 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
- MODULES_END, GFP_KERNEL,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
+ MODULES_END, gfp_mask,
+ PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
@@ -251,7 +252,8 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL, *ibt_endbr = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -267,13 +269,33 @@ int module_finalize(const Elf_Ehdr *hdr,
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
+ if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
+ ibt_endbr = s;
}
+ /*
+ * See alternative_instructions() for the ordering rules between the
+ * various patching types.
+ */
+ if (para) {
+ void *pseg = (void *)para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
+ if (ibt_endbr) {
+ void *iseg = (void *)ibt_endbr->sh_addr;
+ apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
+ }
if (locks && text) {
void *lseg = (void *)locks->sh_addr;
void *tseg = (void *)text->sh_addr;
@@ -282,11 +304,6 @@ int module_finalize(const Elf_Ehdr *hdr,
tseg, tseg + text->sh_size);
}
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
-
/* make jump label nops */
jump_label_apply_nops(me);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 8f06449aab27..fed721f90116 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,6 +19,7 @@
#include <linux/smp.h>
#include <linux/pci.h>
+#include <asm/i8259.h>
#include <asm/io_apic.h>
#include <asm/acpi.h>
#include <asm/irqdomain.h>
@@ -251,7 +252,7 @@ static int __init ELCR_trigger(unsigned int irq)
{
unsigned int port;
- port = 0x4d0 + (irq >> 3);
+ port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bf250a339655..cec0bfa3bc04 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,7 +33,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
- if (!action->handler)
+ if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
list_add_rcu(&action->list, &desc->head);
else
list_add_tail_rcu(&action->list, &desc->head);
-
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler);
void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
- struct nmiaction *n;
+ struct nmiaction *n, *found = NULL;
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name)
WARN(in_nmi(),
"Trying to free NMI (%s) from NMI context!\n", n->name);
list_del_rcu(&n->list);
+ found = n;
break;
}
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
- synchronize_rcu();
+ if (found) {
+ synchronize_rcu();
+ INIT_LIST_HEAD(&found->list);
+ }
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
@@ -292,7 +296,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
reason, smp_processor_id());
- pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
nmi_panic(regs, "NMI: Not continuing");
@@ -524,6 +527,16 @@ nmi_restart:
mds_user_clear_cpu_buffers();
}
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+{
+ exc_nmi(regs);
+}
+#endif
+#if IS_MODULE(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+#endif
+
void stop_nmi(void)
{
ignore_nmis++;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 4f75d0cf6305..9e1ea99ad9df 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void)
return pv_ops.lock.vcpu_is_preempted.func ==
__raw_callee_save___native_vcpu_is_preempted;
}
+
+void __init paravirt_set_cap(void)
+{
+ if (!pv_is_native_spin_unlock())
+ setup_force_cpu_cap(X86_FEATURE_PVUNLOCK);
+
+ if (!pv_is_native_vcpu_is_preempted())
+ setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c60222ab8ab9..7ca2d46c08cc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,6 +14,7 @@
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
+#include <linux/static_call.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -40,11 +41,24 @@ extern void _paravirt_nop(void);
asm (".pushsection .entry.text, \"ax\"\n"
".global _paravirt_nop\n"
"_paravirt_nop:\n\t"
- "ret\n\t"
+ ASM_ENDBR
+ ASM_RET
".size _paravirt_nop, . - _paravirt_nop\n\t"
".type _paravirt_nop, @function\n\t"
".popsection");
+/* stub always returning 0. */
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global paravirt_ret0\n"
+ "paravirt_ret0:\n\t"
+ ASM_ENDBR
+ "xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
+ ASM_RET
+ ".size paravirt_ret0, . - paravirt_ret0\n\t"
+ ".type paravirt_ret0, @function\n\t"
+ ".popsection");
+
+
void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -52,31 +66,17 @@ void __init default_banner(void)
}
/* Undefined instruction for dealing with missing ops pointers. */
-static const unsigned char ud2a[] = { 0x0f, 0x0b };
-
-struct branch {
- unsigned char opcode;
- u32 delta;
-} __attribute__((packed));
+noinstr void paravirt_BUG(void)
+{
+ BUG();
+}
static unsigned paravirt_patch_call(void *insn_buff, const void *target,
unsigned long addr, unsigned len)
{
- const int call_len = 5;
- struct branch *b = insn_buff;
- unsigned long delta = (unsigned long)target - (addr+call_len);
-
- if (len < call_len) {
- pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr);
- /* Kernel might not be viable if patching fails, bail out: */
- BUG_ON(1);
- }
-
- b->opcode = 0xe8; /* call */
- b->delta = delta;
- BUILD_BUG_ON(sizeof(*b) != call_len);
-
- return call_len;
+ __text_gen_insn(insn_buff, CALL_INSN_OPCODE,
+ (void *)addr, target, CALL_INSN_SIZE);
+ return CALL_INSN_SIZE;
}
#ifdef CONFIG_PARAVIRT_XXL
@@ -85,25 +85,6 @@ u64 notrace _paravirt_ident_64(u64 x)
{
return x;
}
-
-static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- struct branch *b = insn_buff;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (len < 5) {
-#ifdef CONFIG_RETPOLINE
- WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
-#endif
- return len; /* call too long for patch site */
- }
-
- b->opcode = 0xe9; /* jmp */
- b->delta = delta;
-
- return 5;
-}
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -114,8 +95,8 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
-unsigned paravirt_patch_default(u8 type, void *insn_buff,
- unsigned long addr, unsigned len)
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
+ unsigned int len)
{
/*
* Neat trick to map patch type back to the call within the
@@ -125,20 +106,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
unsigned ret;
if (opfunc == NULL)
- /* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
+ /* If there's no function, patch it with paravirt_BUG() */
+ ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
else if (opfunc == _paravirt_nop)
ret = 0;
-
-#ifdef CONFIG_PARAVIRT_XXL
- /* identity functions just return their single argument */
- else if (opfunc == _paravirt_ident_64)
- ret = paravirt_patch_ident_64(insn_buff, len);
-
- else if (type == PARAVIRT_PATCH(cpu.iret))
- /* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
-#endif
else
/* Otherwise call the function. */
ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
@@ -146,19 +117,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
return ret;
}
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
- const char *start, const char *end)
-{
- unsigned insn_len = end - start;
-
- /* Alternative instruction is too large for the patch site and we cannot continue: */
- BUG_ON(insn_len > len || start == NULL);
-
- memcpy(insn_buff, start, insn_len);
-
- return insn_len;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -167,9 +125,15 @@ static u64 native_steal_clock(int cpu)
return 0;
}
-/* These are in entry.S */
-extern void native_iret(void);
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
+void paravirt_set_sched_clock(u64 (*func)(void))
+{
+ static_call_update(pv_sched_clock, func);
+}
+
+/* These are in entry.S */
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
@@ -248,6 +212,36 @@ void paravirt_end_context_switch(struct task_struct *next)
if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
arch_enter_lazy_mmu_mode();
}
+
+static noinstr unsigned long pv_native_read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static noinstr void pv_native_write_cr2(unsigned long val)
+{
+ native_write_cr2(val);
+}
+
+static noinstr unsigned long pv_native_get_debugreg(int regno)
+{
+ return native_get_debugreg(regno);
+}
+
+static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
+{
+ native_set_debugreg(regno, val);
+}
+
+static noinstr void pv_native_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+static noinstr void pv_native_irq_disable(void)
+{
+ native_irq_disable();
+}
#endif
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
@@ -269,20 +263,13 @@ struct pv_info pv_info = {
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
struct paravirt_patch_template pv_ops = {
- /* Init ops. */
- .init.patch = native_patch,
-
- /* Time ops. */
- .time.sched_clock = native_sched_clock,
- .time.steal_clock = native_steal_clock,
-
/* Cpu ops. */
.cpu.io_delay = native_io_delay,
#ifdef CONFIG_PARAVIRT_XXL
.cpu.cpuid = native_cpuid,
- .cpu.get_debugreg = native_get_debugreg,
- .cpu.set_debugreg = native_set_debugreg,
+ .cpu.get_debugreg = pv_native_get_debugreg,
+ .cpu.set_debugreg = pv_native_set_debugreg,
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
@@ -308,8 +295,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.iret = native_iret,
-
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
.cpu.update_io_bitmap = native_tss_update_io_bitmap,
@@ -320,8 +305,8 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
.irq.safe_halt = native_safe_halt,
.irq.halt = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */
@@ -330,15 +315,16 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
- .mmu.flush_tlb_others = native_flush_tlb_others,
+ .mmu.flush_tlb_multi = native_flush_tlb_multi,
.mmu.tlb_remove_table =
(void (*)(struct mmu_gather *, void *))tlb_remove_page,
.mmu.exit_mmap = paravirt_nop,
+ .mmu.notify_page_enc_status_changed = paravirt_nop,
#ifdef CONFIG_PARAVIRT_XXL
- .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2),
- .mmu.write_cr2 = native_write_cr2,
+ .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+ .mmu.write_cr2 = pv_native_write_cr2,
.mmu.read_cr3 = __native_read_cr3,
.mmu.write_cr3 = native_write_cr3,
@@ -410,9 +396,6 @@ struct paravirt_patch_template pv_ops = {
};
#ifdef CONFIG_PARAVIRT_XXL
-/* At this point, native_get/set_debugreg has real function entries */
-NOKPROBE_SYMBOL(native_get_debugreg);
-NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
#endif
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
deleted file mode 100644
index abd27ec67397..000000000000
--- a/arch/x86/kernel/paravirt_patch.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stringify.h>
-
-#include <asm/paravirt.h>
-#include <asm/asm-offsets.h>
-
-#define PSTART(d, m) \
- patch_data_##d.m
-
-#define PEND(d, m) \
- (PSTART(d, m) + sizeof(patch_data_##d.m))
-
-#define PATCH(d, m, insn_buff, len) \
- paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m))
-
-#define PATCH_CASE(ops, m, data, insn_buff, len) \
- case PARAVIRT_PATCH(ops.m): \
- return PATCH(data, ops##_##m, insn_buff, len)
-
-#ifdef CONFIG_PARAVIRT_XXL
-struct patch_xxl {
- const unsigned char irq_irq_disable[1];
- const unsigned char irq_irq_enable[1];
- const unsigned char irq_save_fl[2];
- const unsigned char mmu_read_cr2[3];
- const unsigned char mmu_read_cr3[3];
- const unsigned char mmu_write_cr3[3];
- const unsigned char cpu_wbinvd[2];
- const unsigned char mov64[3];
-};
-
-static const struct patch_xxl patch_data_xxl = {
- .irq_irq_disable = { 0xfa }, // cli
- .irq_irq_enable = { 0xfb }, // sti
- .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax
- .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
- .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
- .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
-};
-
-unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len)
-{
- return PATCH(xxl, mov64, insn_buff, len);
-}
-# endif /* CONFIG_PARAVIRT_XXL */
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-struct patch_lock {
- unsigned char queued_spin_unlock[3];
- unsigned char vcpu_is_preempted[2];
-};
-
-static const struct patch_lock patch_data_lock = {
- .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax
-
-# ifdef CONFIG_X86_64
- .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi)
-# else
- .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax)
-# endif
-};
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-
-unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- switch (type) {
-
-#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
-
- PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len);
- PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
- PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
-
- PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
-#endif
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- case PARAVIRT_PATCH(lock.queued_spin_unlock):
- if (pv_is_native_spin_unlock())
- return PATCH(lock, queued_spin_unlock, insn_buff, len);
- break;
-
- case PARAVIRT_PATCH(lock.vcpu_is_preempted):
- if (pv_is_native_vcpu_is_preempted())
- return PATCH(lock, vcpu_is_preempted, insn_buff, len);
- break;
-#endif
- default:
- break;
- }
-
- return paravirt_patch_default(type, insn_buff, addr, len);
-}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de234e7a8962..30bbe4abb5d6 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -7,13 +7,16 @@
#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/pci.h>
+#include <linux/amd-iommu.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
+
+#include <xen/xen.h>
+#include <xen/swiotlb-xen.h>
static bool disable_dac_quirk __read_mostly;
@@ -34,24 +37,90 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+#ifdef CONFIG_SWIOTLB
+bool x86_swiotlb_enable;
+static unsigned int x86_swiotlb_flags;
+
+static void __init pci_swiotlb_detect(void)
+{
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
+ x86_swiotlb_enable = true;
+
+ /*
+ * Set swiotlb to 1 so that bounce buffers are allocated and used for
+ * devices that can't support DMA to encrypted memory.
+ */
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ x86_swiotlb_enable = true;
+
+ /*
+ * Guest with guest memory encryption currently perform all DMA through
+ * bounce buffers as the hypervisor can't access arbitrary VM memory
+ * that is not explicitly shared with it.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_FORCE;
+ }
+}
+#else
+static inline void __init pci_swiotlb_detect(void)
+{
+}
+#define x86_swiotlb_flags 0
+#endif /* CONFIG_SWIOTLB */
+
+#ifdef CONFIG_SWIOTLB_XEN
+static void __init pci_xen_swiotlb_init(void)
+{
+ if (!xen_initial_domain() && !x86_swiotlb_enable)
+ return;
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_ANY;
+ swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
+ dma_ops = &xen_swiotlb_dma_ops;
+ if (IS_ENABLED(CONFIG_PCI))
+ pci_request_acs();
+}
+
+int pci_xen_swiotlb_init_late(void)
+{
+ if (dma_ops == &xen_swiotlb_dma_ops)
+ return 0;
+
+ /* we can work with the default swiotlb */
+ if (!io_tlb_default_mem.nslabs) {
+ int rc = swiotlb_init_late(swiotlb_size_or_default(),
+ GFP_KERNEL, xen_swiotlb_fixup);
+ if (rc < 0)
+ return rc;
+ }
+
+ /* XXX: this switches the dma ops under live devices! */
+ dma_ops = &xen_swiotlb_dma_ops;
+ if (IS_ENABLED(CONFIG_PCI))
+ pci_request_acs();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
+#else
+static inline void __init pci_xen_swiotlb_init(void)
+{
+}
+#endif /* CONFIG_SWIOTLB_XEN */
void __init pci_iommu_alloc(void)
{
- struct iommu_table_entry *p;
-
- sort_iommu_table(__iommu_table, __iommu_table_end);
- check_iommu_entries(__iommu_table, __iommu_table_end);
-
- for (p = __iommu_table; p < __iommu_table_end; p++) {
- if (p && p->detect && p->detect() > 0) {
- p->flags |= IOMMU_DETECTED;
- if (p->early_init)
- p->early_init();
- if (p->flags & IOMMU_FINISH_IF_DETECTED)
- break;
- }
+ if (xen_pv_domain()) {
+ pci_xen_swiotlb_init();
+ return;
}
+ pci_swiotlb_detect();
+ gart_iommu_hole_init();
+ amd_iommu_detect();
+ detect_intel_iommu();
+ swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
/*
@@ -102,7 +171,7 @@ static __init int iommu_setup(char *p)
}
#ifdef CONFIG_SWIOTLB
if (!strncmp(p, "soft", 4))
- swiotlb = 1;
+ x86_swiotlb_enable = true;
#endif
if (!strncmp(p, "pt", 2))
iommu_set_default_passthrough(true);
@@ -121,14 +190,17 @@ early_param("iommu", iommu_setup);
static int __init pci_iommu_init(void)
{
- struct iommu_table_entry *p;
-
x86_init.iommu.iommu_init();
- for (p = __iommu_table; p < __iommu_table_end; p++) {
- if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
- p->late_init();
+#ifdef CONFIG_SWIOTLB
+ /* An IOMMU turned us off. */
+ if (x86_swiotlb_enable) {
+ pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else {
+ swiotlb_exit();
}
+#endif
return 0;
}
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
deleted file mode 100644
index 42e92ec62973..000000000000
--- a/arch/x86/kernel/pci-iommu_table.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/dma-mapping.h>
-#include <asm/iommu_table.h>
-#include <linux/string.h>
-#include <linux/kallsyms.h>
-
-static struct iommu_table_entry * __init
-find_dependents_of(struct iommu_table_entry *start,
- struct iommu_table_entry *finish,
- struct iommu_table_entry *q)
-{
- struct iommu_table_entry *p;
-
- if (!q)
- return NULL;
-
- for (p = start; p < finish; p++)
- if (p->detect == q->depend)
- return p;
-
- return NULL;
-}
-
-
-void __init sort_iommu_table(struct iommu_table_entry *start,
- struct iommu_table_entry *finish) {
-
- struct iommu_table_entry *p, *q, tmp;
-
- for (p = start; p < finish; p++) {
-again:
- q = find_dependents_of(start, finish, p);
- /* We are bit sneaky here. We use the memory address to figure
- * out if the node we depend on is past our point, if so, swap.
- */
- if (q > p) {
- tmp = *p;
- memmove(p, q, sizeof(*p));
- *q = tmp;
- goto again;
- }
- }
-
-}
-
-#ifdef DEBUG
-void __init check_iommu_entries(struct iommu_table_entry *start,
- struct iommu_table_entry *finish)
-{
- struct iommu_table_entry *p, *q, *x;
-
- /* Simple cyclic dependency checker. */
- for (p = start; p < finish; p++) {
- q = find_dependents_of(start, finish, p);
- x = find_dependents_of(start, finish, q);
- if (p == x) {
- printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
- p->detect, q->detect);
- /* Heavy handed way..*/
- x->depend = NULL;
- }
- }
-
- for (p = start; p < finish; p++) {
- q = find_dependents_of(p, finish, p);
- if (q && q > p) {
- printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
- p->detect, q->detect);
- }
- }
-}
-#else
-void __init check_iommu_entries(struct iommu_table_entry *start,
- struct iommu_table_entry *finish)
-{
-}
-#endif
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
deleted file mode 100644
index c2cfa5e7c152..000000000000
--- a/arch/x86/kernel/pci-swiotlb.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/pci.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/swiotlb.h>
-#include <linux/memblock.h>
-#include <linux/dma-direct.h>
-#include <linux/mem_encrypt.h>
-
-#include <asm/iommu.h>
-#include <asm/swiotlb.h>
-#include <asm/dma.h>
-#include <asm/xen/swiotlb-xen.h>
-#include <asm/iommu_table.h>
-
-int swiotlb __read_mostly;
-
-/*
- * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
- *
- * This returns non-zero if we are forced to use swiotlb (by the boot
- * option).
- */
-int __init pci_swiotlb_detect_override(void)
-{
- if (swiotlb_force == SWIOTLB_FORCE)
- swiotlb = 1;
-
- return swiotlb;
-}
-IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
- pci_xen_swiotlb_detect,
- pci_swiotlb_init,
- pci_swiotlb_late_init);
-
-/*
- * If 4GB or more detected (and iommu=off not set) or if SME is active
- * then set swiotlb to 1 and return 1.
- */
-int __init pci_swiotlb_detect_4gb(void)
-{
- /* don't initialize swiotlb if iommu=off (no_iommu=1) */
- if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
-
- /*
- * If SME is active then swiotlb will be set to 1 so that bounce
- * buffers are allocated and used for devices that do not support
- * the addressing range required for the encryption mask.
- */
- if (sme_active())
- swiotlb = 1;
-
- return swiotlb;
-}
-IOMMU_INIT(pci_swiotlb_detect_4gb,
- pci_swiotlb_detect_override,
- pci_swiotlb_init,
- pci_swiotlb_late_init);
-
-void __init pci_swiotlb_init(void)
-{
- if (swiotlb)
- swiotlb_init(0);
-}
-
-void __init pci_swiotlb_late_init(void)
-{
- /* An IOMMU turned us off. */
- if (!swiotlb)
- swiotlb_exit();
- else {
- printk(KERN_INFO "PCI-DMA: "
- "Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_print_info();
- }
-}
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 9e1def3744f2..319fef37d9dc 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -21,6 +21,7 @@
#include <asm/sections.h>
#include <asm/io.h>
#include <asm/setup_arch.h>
+#include <asm/sev.h>
static struct resource system_rom_resource = {
.name = "System ROM",
@@ -80,7 +81,7 @@ static struct resource video_rom_resource = {
*/
static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
{
- struct pci_driver *drv = pdev->driver;
+ struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
const struct pci_device_id *id;
if (pdev->vendor == vendor && pdev->device == device)
@@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length)
void __init probe_roms(void)
{
- const unsigned char *rom;
unsigned long start, length, upper;
+ const unsigned char *rom;
unsigned char c;
int i;
+ /*
+ * The ROM memory range is not part of the e820 table and is therefore not
+ * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
+ * memory, and SNP requires encrypted memory to be validated before access.
+ * Do that here.
+ */
+ snp_prep_memory(video_rom_resource.start,
+ ((system_rom_resource.end + 1) - video_rom_resource.start),
+ SNP_PAGE_STATE_PRIVATE);
+
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c214d7085a4..9b2772b7e1f3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,7 +30,9 @@
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
@@ -43,6 +45,8 @@
#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include <asm/frame.h>
+#include <asm/unwind.h>
+#include <asm/tdx.h>
#include "process.h"
@@ -63,14 +67,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
- /*
- * .sp1 is cpu_current_top_of_stack. The init task never
- * runs user code, but cpu_current_top_of_stack should still
- * be well defined before the first context switch.
- */
+#ifdef CONFIG_X86_32
.sp1 = TOP_OF_INIT_STACK,
-#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
@@ -92,9 +91,19 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
+ /* Drop the copied pointer to current's fpstate */
+ dst->thread.fpu.fpstate = NULL;
+
+ return 0;
+}
- return fpu__copy(dst, src);
+#ifdef CONFIG_X86_64
+void arch_release_task_struct(struct task_struct *tsk)
+{
+ if (fpu_state_size_dynamic())
+ fpstate_free(&tsk->thread.fpu);
}
+#endif
/*
* Free thread data structures etc..
@@ -122,9 +131,11 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
-int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
- struct task_struct *p, unsigned long tls)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
+ unsigned long clone_flags = args->flags;
+ unsigned long sp = args->stack;
+ unsigned long tls = args->tls;
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
struct pt_regs *childregs;
@@ -138,6 +149,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
@@ -151,6 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
savesegment(ds, p->thread.ds);
#else
p->thread.sp0 = (unsigned long) (childregs + 1);
+ savesegment(gs, p->thread.gs);
/*
* Clear all status flags including IF and set fixed bit. 64bit
* does not have this initialization as the frame does not contain
@@ -160,22 +173,44 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->flags = X86_EFLAGS_FIXED;
#endif
+ fpu_clone(p, clone_flags, args->fn);
+
/* Kernel thread ? */
- if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ p->thread.pkru = pkru_get_init_value();
memset(childregs, 0, sizeof(struct pt_regs));
- kthread_frame_init(frame, sp, arg);
+ kthread_frame_init(frame, args->fn, args->fn_arg);
return 0;
}
+ /*
+ * Clone current's PKRU value from hardware. tsk->thread.pkru
+ * is only valid when scheduled out.
+ */
+ p->thread.pkru = read_pkru();
+
frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
-#ifdef CONFIG_X86_32
- task_user_gs(p) = get_user_gs(current_pt_regs());
-#endif
+ if (unlikely(args->fn)) {
+ /*
+ * A user space thread, but it doesn't return to
+ * ret_after_fork().
+ *
+ * In order to indicate that to tools like gdb,
+ * we reset the stack and instruction pointers.
+ *
+ * It does the same kernel frame setup to return to a kernel
+ * function that a kernel thread does.
+ */
+ childregs->sp = 0;
+ childregs->ip = 0;
+ kthread_frame_init(frame, args->fn, args->fn_arg);
+ return 0;
+ }
/* Set a new TLS for the child thread? */
if (clone_flags & CLONE_SETTLS)
@@ -187,6 +222,15 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
return ret;
}
+static void pkru_flush_thread(void)
+{
+ /*
+ * If PKRU is enabled the default PKRU value has to be loaded into
+ * the hardware right here (similar to context switch).
+ */
+ pkru_write_default();
+}
+
void flush_thread(void)
{
struct task_struct *tsk = current;
@@ -194,7 +238,8 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- fpu__clear_all(&tsk->thread.fpu);
+ fpu_flush_thread();
+ pkru_flush_thread();
}
void disable_TSC(void)
@@ -289,7 +334,7 @@ static int get_cpuid_mode(void)
return !test_thread_flag(TIF_NOCPUID);
}
-static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+static int set_cpuid_mode(unsigned long cpuid_enabled)
{
if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
return -ENODEV;
@@ -320,7 +365,7 @@ void arch_setup_new_exec(void)
clear_thread_flag(TIF_SSBD);
task_clear_spec_ssb_disable(current);
task_clear_spec_ssb_noexec(current);
- speculation_ctrl_update(task_thread_info(current)->flags);
+ speculation_ctrl_update(read_thread_flags());
}
}
@@ -360,7 +405,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
}
/**
- * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
*/
void native_tss_update_io_bitmap(void)
{
@@ -451,7 +496,7 @@ void speculative_store_bypass_ht_init(void)
* First HT sibling to come up on the core. Link shared state of
* the first HT sibling to itself. The siblings on the same core
* which come up later will see the shared state pointer and link
- * themself to the state of this CPU.
+ * themselves to the state of this CPU.
*/
st->shared_state = st;
}
@@ -572,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
}
/* Return the updated threadinfo flags*/
- return task_thread_info(tsk)->flags;
+ return read_task_thread_flags(tsk);
}
void speculation_ctrl_update(unsigned long tif)
@@ -608,8 +653,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
unsigned long tifp, tifn;
- tifn = READ_ONCE(task_thread_info(next_p)->flags);
- tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+ tifn = read_task_thread_flags(next_p);
+ tifp = read_task_thread_flags(prev_p);
switch_to_bitmap(tifp);
@@ -641,9 +686,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
/* Enforce MSR update to ensure consistent state */
__speculation_ctrl_update(~tifn, tifn);
}
-
- if ((tifp ^ tifn) & _TIF_SLD)
- switch_to_sld(tifn);
}
/*
@@ -702,7 +744,7 @@ bool xen_set_default_idle(void)
}
#endif
-void stop_this_cpu(void *dummy)
+void __noreturn stop_this_cpu(void *dummy)
{
local_irq_disable();
/*
@@ -720,8 +762,11 @@ void stop_this_cpu(void *dummy)
* without the encryption bit, they don't race each other when flushed
* and potentially end up with the wrong entry being committed to
* memory.
+ *
+ * Test the CPUID bit directly because the machine might've cleared
+ * X86_FEATURE_SME due to cmdline options.
*/
- if (boot_cpu_has(X86_FEATURE_SME))
+ if (cpuid_eax(0x8000001f) & BIT(0))
native_wbinvd();
for (;;) {
/*
@@ -825,6 +870,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
x86_idle = mwait_idle;
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ x86_idle = tdx_safe_halt;
} else
x86_idle = default_idle;
}
@@ -914,70 +962,42 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
* because the task might wake up and we might look at a stack
* changing under us.
*/
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip, ret = 0;
- int count = 0;
-
- if (p == current || p->state == TASK_RUNNING)
- return 0;
+ struct unwind_state state;
+ unsigned long addr = 0;
if (!try_get_task_stack(p))
return 0;
- start = (unsigned long)task_stack_page(p);
- if (!start)
- goto out;
-
- /*
- * Layout of the stack page:
- *
- * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
- * PADDING
- * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
- * stack
- * ----------- bottom = start
- *
- * The tasks stack pointer points at the location where the
- * framepointer is stored. The data on the stack is:
- * ... IP FP ... IP FP
- *
- * We need to read FP and IP, so we need to adjust the upper
- * bound by another unsigned long.
- */
- top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
- top -= 2 * sizeof(unsigned long);
- bottom = start;
-
- sp = READ_ONCE(p->thread.sp);
- if (sp < bottom || sp > top)
- goto out;
-
- fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
- do {
- if (fp < bottom || fp > top)
- goto out;
- ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip)) {
- ret = ip;
- goto out;
- }
- fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
- } while (count++ < 16 && p->state != TASK_RUNNING);
+ for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (in_sched_functions(addr))
+ continue;
+ break;
+ }
-out:
put_task_stack(p);
- return ret;
+
+ return addr;
}
-long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled)
+long do_arch_prctl_common(int option, unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
- return set_cpuid_mode(task, cpuid_enabled);
+ return set_cpuid_mode(arg2);
+ case ARCH_GET_XCOMP_SUPP:
+ case ARCH_GET_XCOMP_PERM:
+ case ARCH_REQ_XCOMP_PERM:
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ return fpu_xstate_prctl(option, arg2);
}
return -EINVAL;
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 1d0797b2338a..76b547b83232 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long prev_tif = task_thread_info(prev)->flags;
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long prev_tif = read_task_thread_flags(prev);
if (IS_ENABLED(CONFIG_SMP)) {
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4f2f54e1281c..2f314b170c9f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -41,7 +41,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/sched.h>
#include <asm/desc.h>
#include <linux/err.h>
@@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
unsigned long d0, d1, d2, d3, d6, d7;
unsigned short gs;
- if (user_mode(regs))
- gs = get_user_gs(regs);
- else
- savesegment(gs, gs);
+ savesegment(gs, gs);
show_ip(regs, log_lvl);
@@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task)
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- set_user_gs(regs, 0);
+ loadsegment(gs, 0);
regs->fs = 0;
regs->ds = __USER_DS;
regs->es = __USER_DS;
@@ -160,7 +157,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -178,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- lazy_save_gs(prev->gs);
+ savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -209,11 +205,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- lazy_load_gs(next->gs);
+ loadsegment(gs, next->gs);
this_cpu_write(current_task, next_p);
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
@@ -223,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return do_arch_prctl_common(current, option, arg2);
+ return do_arch_prctl_common(option, arg2);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d08307df69ad..1962008fe743 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -41,7 +41,8 @@
#include <linux/syscalls.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
+#include <asm/pkru.h>
+#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
@@ -136,7 +137,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
log_lvl, d3, d6, d7);
}
- if (boot_cpu_has(X86_FEATURE_OSPKE))
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE))
printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
@@ -339,6 +340,29 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+/*
+ * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
+ * is not XSTATE managed on context switch because that would require a
+ * lookup in the task's FPU xsave buffer and require to keep that updated
+ * in various places.
+ */
+static __always_inline void x86_pkru_load(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Stash the prev task's value: */
+ prev->pkru = rdpkru();
+
+ /*
+ * PKRU writes are slightly expensive. Avoid them when not
+ * strictly necessary:
+ */
+ if (prev->pkru != next->pkru)
+ wrpkru(next->pkru);
+}
+
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
@@ -535,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
@@ -588,13 +611,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
x86_fsgsbase_load(prev, next);
+ x86_pkru_load(prev, next);
+
/*
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Reload sp0. */
update_task_stack(next_p);
@@ -656,7 +681,7 @@ void set_personality_64bit(void)
static void __set_personality_x32(void)
{
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (current->mm)
current->mm->context.flags = 0;
@@ -819,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
ret = do_arch_prctl_64(current, option, arg2);
if (ret == -EINVAL)
- ret = do_arch_prctl_common(current, option, arg2);
+ ret = do_arch_prctl_common(option, arg2);
return ret;
}
@@ -827,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return do_arch_prctl_common(current, option, arg2);
+ return do_arch_prctl_common(option, arg2);
}
#endif
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 87a4143aa7d7..37c12fb92906 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -13,7 +13,6 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
@@ -29,9 +28,9 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -171,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
if (task == current)
- retval = get_user_gs(task_pt_regs(task));
+ savesegment(gs, retval);
else
- retval = task_user_gs(task);
+ retval = task->thread.gs;
}
return retval;
}
@@ -211,7 +210,7 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- task_user_gs(task) = value;
+ task->thread.gs = value;
}
return 0;
@@ -911,7 +910,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
* syscall with TS_COMPAT still set.
*/
regs->orig_ax = value;
- if (syscall_get_nr(child, regs) >= 0)
+ if (syscall_get_nr(child, regs) != -1)
child->thread_info.status |= TS_I386_REGS_POKED;
break;
@@ -1224,7 +1223,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
},
[REGSET_FP] = {
.core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_struct) / sizeof(long),
+ .n = sizeof(struct fxregs_state) / sizeof(long),
.size = sizeof(long), .align = sizeof(long),
.active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
},
@@ -1271,7 +1270,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
},
[REGSET_XFP] = {
.core_note_type = NT_PRXFPREG,
- .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
+ .n = sizeof(struct fxregs_state) / sizeof(u32),
.size = sizeof(u32), .align = sizeof(u32),
.active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
},
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 11065dc03f5b..eda37df016f0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
/*
* Assumption here is that last_value, a global accumulator, always goes
* forward. If we are less than that, we should not be much smaller.
- * We assume there is an error marging we're inside, and then the correction
+ * We assume there is an error margin we're inside, and then the correction
* does not sacrifice accuracy.
*
* For reads: global may have changed between test and return,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index b29657b76e3f..c3636ea4aa71 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
spin_unlock(&rtc_lock);
/*
- * Switch back to the initial page table.
+ * Switch to the trampoline page table.
*/
-#ifdef CONFIG_X86_32
- load_cr3(initial_page_table);
-#else
- write_cr3(real_mode_header->trampoline_pgd);
-
- /* Exiting long mode will fail if CR4.PCIDE is set. */
- if (boot_cpu_has(X86_FEATURE_PCID))
- cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+ load_trampoline_pgtable();
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
@@ -388,10 +380,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
{ /* Handle problems with rebooting on the OptiPlex 990. */
.callback = set_pci_reboot,
- .ident = "Dell OptiPlex 990",
+ .ident = "Dell OptiPlex 990 BIOS A0x",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A0"),
},
},
{ /* Handle problems with rebooting on Dell 300's */
@@ -669,7 +662,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_TRIPLE:
- idt_invalidate(NULL);
+ idt_invalidate();
__asm__ __volatile__("int3");
/* We're probably dead after this, but... */
@@ -746,10 +739,10 @@ static void native_machine_halt(void)
static void native_machine_power_off(void)
{
- if (pm_power_off) {
+ if (kernel_can_power_off()) {
if (!reboot_force)
machine_shutdown();
- pm_power_off();
+ do_kernel_power_off();
}
/* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 94b33885f8d2..fcc8a7699103 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movl %cr0, %eax
andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
@@ -159,7 +159,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %edx, %edx
xorl %esi, %esi
xorl %ebp, %ebp
- ret
+ RET
1:
popl %edx
movl CP_PA_SWAP_PAGE(%edi), %esp
@@ -190,7 +190,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movl %edi, %eax
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
@@ -208,7 +208,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popl %edi
popl %esi
popl %ebx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -271,7 +271,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
popl %edi
popl %ebx
popl %ebp
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index a4d9a261425b..c1d8626c53b6 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -42,12 +42,13 @@
.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
/*
* %rdi indirection_page
* %rsi page_list
* %rdx start address
* %rcx preserve_context
- * %r8 sme_active
+ * %r8 host_mem_enc_active
*/
/* Save the CPU context, used for jumping back */
@@ -104,7 +105,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -115,13 +116,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
pushq %rdx
/*
+ * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
+ * below.
+ */
+ movq %cr4, %rax
+ andq $~(X86_CR4_CET), %rax
+ movq %rax, %cr4
+
+ /*
* Set cr0 to a known state:
* - Paging enabled
* - Alignment check disabled
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movq %cr0, %rax
andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
@@ -191,7 +200,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %r14d, %r14d
xorl %r15d, %r15d
- ret
+ RET
1:
popq %rdx
@@ -210,11 +219,12 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
call swap_pages
movq $virtual_mapped, %rax
pushq %rax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR // RET target, above
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
movq %rax, %cr4
@@ -231,7 +241,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popq %r12
popq %rbp
popq %rbx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -288,7 +298,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
lea PAGE_SIZE(%rax), %rsi
jmp 0b
3:
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 9b9fb7882c20..bba1abd05bfe 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ioport.h>
+#include <linux/printk.h>
#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
static void resource_clip(struct resource *res, resource_size_t start,
resource_size_t end)
@@ -27,12 +29,23 @@ static void remove_e820_regions(struct resource *avail)
{
int i;
struct e820_entry *entry;
+ u64 e820_start, e820_end;
+ struct resource orig = *avail;
+
+ if (!pci_use_e820)
+ return;
for (i = 0; i < e820_table->nr_entries; i++) {
entry = &e820_table->entries[i];
+ e820_start = entry->addr;
+ e820_end = entry->addr + entry->size - 1;
- resource_clip(avail, entry->addr,
- entry->addr + entry->size - 1);
+ resource_clip(avail, e820_start, e820_end);
+ if (orig.start != avail->start || orig.end != avail->end) {
+ pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
+ &orig, avail, e820_start, e820_end);
+ orig = *avail;
+ }
}
}
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
new file mode 100644
index 000000000000..8a1c0111ae79
--- /dev/null
+++ b/arch/x86/kernel/rethook.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c.
+ */
+#include <linux/bug.h>
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+#include <linux/objtool.h>
+
+#include "kprobes/common.h"
+
+__visible void arch_rethook_trampoline_callback(struct pt_regs *regs);
+
+#ifndef ANNOTATE_NOENDBR
+#define ANNOTATE_NOENDBR
+#endif
+
+/*
+ * When a target function returns, this code saves registers and calls
+ * arch_rethook_trampoline_callback(), which calls the rethook handler.
+ */
+asm(
+ ".text\n"
+ ".global arch_rethook_trampoline\n"
+ ".type arch_rethook_trampoline, @function\n"
+ "arch_rethook_trampoline:\n"
+#ifdef CONFIG_X86_64
+ ANNOTATE_NOENDBR /* This is only jumped from ret instruction */
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushq $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save the 'sp - 16', this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addq $16, %rsp\n"
+ " popfq\n"
+#else
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushl $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushl %ss\n"
+ /* Save the 'sp - 8', this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addl $8, %esp\n"
+ " popfl\n"
+#endif
+ ASM_RET
+ ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n"
+);
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ unsigned long *frame_pointer;
+
+ /* fixup registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)&arch_rethook_trampoline;
+ regs->orig_ax = ~0UL;
+ regs->sp += 2*sizeof(long);
+ frame_pointer = (long *)(regs + 1);
+
+ /*
+ * The return address at 'frame_pointer' is recovered by the
+ * arch_rethook_fixup_return() which called from this
+ * rethook_trampoline_handler().
+ */
+ rethook_trampoline_handler(regs, (unsigned long)frame_pointer);
+
+ /*
+ * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trapmoline()
+ * can do RET right after POPF.
+ */
+ *(unsigned long *)&regs->ss = regs->flags;
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/*
+ * arch_rethook_trampoline() skips updating frame pointer. The frame pointer
+ * saved in arch_rethook_trampoline_callback() points to the real caller
+ * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have
+ * a standard stack frame with CONFIG_FRAME_POINTER=y.
+ * Let's mark it non-standard function. Anyway, FP unwinder can correctly
+ * unwind without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline);
+
+/* This is called from rethook_trampoline_handler(). */
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ unsigned long *frame_pointer = (void *)(regs + 1);
+
+ /* Replace fake return address with real one. */
+ *frame_pointer = correct_ret_addr;
+}
+NOKPROBE_SYMBOL(arch_rethook_fixup_return);
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
+{
+ unsigned long *stack = (unsigned long *)regs->sp;
+
+ rh->ret_addr = stack[0];
+ rh->frame = regs->sp;
+
+ /* Replace the return addr with trampoline addr */
+ stack[0] = (unsigned long) arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d883176ef2ce..bd6c6fd373ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -5,6 +5,7 @@
* This file contains the setup_arch() code, which handles the architecture-dependent
* parts of early kernel initialization.
*/
+#include <linux/acpi.h>
#include <linux/console.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
@@ -14,6 +15,7 @@
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
#include <linux/memblock.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/hugetlb.h>
@@ -38,12 +40,14 @@
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
+#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
#include <asm/olpc_ofw.h>
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
+#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
@@ -63,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif
-/*
- * Range of the BSS area. The size of the BSS area is determined
- * at link time, with RESERVE_BRK*() facility reserving additional
- * chunks.
- */
unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
@@ -319,7 +318,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
@@ -365,21 +364,41 @@ static void __init parse_setup_data(void)
static void __init memblock_x86_reserve_range_setup_data(void)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
+ u32 len;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
+ if (!data) {
+ pr_warn("setup: failed to memremap setup_data entry\n");
+ return;
+ }
+
+ len = sizeof(*data);
+ pa_next = data->next;
+
memblock_reserve(pa_data, sizeof(*data) + data->len);
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT)
- memblock_reserve(((struct setup_indirect *)data->data)->addr,
- ((struct setup_indirect *)data->data)->len);
+ if (data->type == SETUP_INDIRECT) {
+ len += data->len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap(pa_data, len);
+ if (!data) {
+ pr_warn("setup: failed to memremap indirect setup_data\n");
+ return;
+ }
- pa_data = data->next;
- early_memunmap(data, sizeof(*data));
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT)
+ memblock_reserve(indirect->addr, indirect->len);
+ }
+
+ pa_data = pa_next;
+ early_memunmap(data, len);
}
}
@@ -387,8 +406,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC_CORE
-
/* 16M alignment for crash kernel regions */
#define CRASH_ALIGN SZ_16M
@@ -466,6 +483,9 @@ static void __init reserve_crashkernel(void)
bool high = false;
int ret;
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
total_mem = memblock_phys_mem_size();
/* crashkernel=XM */
@@ -518,7 +538,7 @@ static void __init reserve_crashkernel(void)
}
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_free(crash_base, crash_size);
+ memblock_phys_free(crash_base, crash_size);
return;
}
@@ -531,11 +551,6 @@ static void __init reserve_crashkernel(void)
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
}
-#else
-static void __init reserve_crashkernel(void)
-{
-}
-#endif
static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
@@ -570,16 +585,6 @@ void __init reserve_standard_io_resources(void)
}
-static __init void reserve_ibft_region(void)
-{
- unsigned long addr, size = 0;
-
- addr = find_ibft_region(&size);
-
- if (size)
- memblock_reserve(addr, size);
-}
-
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -633,11 +638,16 @@ static void __init trim_snb_memory(void)
printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
/*
- * Reserve all memory below the 1 MB mark that has not
- * already been reserved.
+ * SandyBridge integrated graphics devices have a bug that prevents
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+ * To avoid these pages being ever accessed by SNB gfx devices reserve
+ * bad_pages that have not already been reserved at boot time.
+ * All memory below the 1 MB mark is anyway reserved later during
+ * setup_arch(), so there is no need to reserve it here.
*/
- memblock_reserve(0, 1<<20);
-
+
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
printk(KERN_WARNING "failed to reserve 0x%08lx\n",
@@ -645,18 +655,6 @@ static void __init trim_snb_memory(void)
}
}
-/*
- * Here we put platform-specific memory range workarounds, i.e.
- * memory known to be corrupt or otherwise in need to be reserved on
- * specific platforms.
- *
- * If this gets used more widely it could use a real dispatch mechanism.
- */
-static void __init trim_platform_memory_ranges(void)
-{
- trim_snb_memory();
-}
-
static void __init trim_bios_range(void)
{
/*
@@ -701,35 +699,39 @@ static void __init e820_add_kernel_range(void)
e820__range_add(start, size, E820_TYPE_RAM);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
-
-static int __init parse_reservelow(char *p)
+static void __init early_reserve_memory(void)
{
- unsigned long long size;
-
- if (!p)
- return -EINVAL;
-
- size = memparse(p, &p);
+ /*
+ * Reserve the memory occupied by the kernel between _text and
+ * __end_of_kernel_reserve symbols. Any kernel sections after the
+ * __end_of_kernel_reserve symbol must be explicitly reserved with a
+ * separate memblock_reserve() or they will be discarded.
+ */
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
- if (size < 4096)
- size = 4096;
+ /*
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+ * Reserve the first 64K of memory since some BIOSes are known to
+ * corrupt low memory. After the real mode trampoline is allocated the
+ * rest of the memory below 640k is reserved.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, SZ_64K);
- if (size > 640*1024)
- size = 640*1024;
+ early_reserve_initrd();
- reserve_low = size;
+ memblock_x86_reserve_range_setup_data();
- return 0;
+ reserve_ibft_region();
+ reserve_bios_regions();
+ trim_snb_memory();
}
-early_param("reservelow", parse_reservelow);
-
-static void __init trim_low_memory_range(void)
-{
- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
-}
-
/*
* Dump out kernel offset information on panic.
*/
@@ -749,6 +751,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
+void x86_configure_nx(void)
+{
+ if (boot_cpu_has(X86_FEATURE_NX))
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+static void __init x86_report_nx(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
+#endif
+ }
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -764,29 +790,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
void __init setup_arch(char **cmdline_p)
{
- /*
- * Reserve the memory occupied by the kernel between _text and
- * __end_of_kernel_reserve symbols. Any kernel sections after the
- * __end_of_kernel_reserve symbol must be explicitly reserved with a
- * separate memblock_reserve() or they will be discarded.
- */
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
-
- /*
- * Make sure page 0 is always reserved because on systems with
- * L1TF its contents can be leaked to user processes.
- */
- memblock_reserve(0, PAGE_SIZE);
-
- early_reserve_initrd();
-
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -822,7 +825,6 @@ void __init setup_arch(char **cmdline_p)
idt_setup_early_traps();
early_cpu_init();
- arch_init_ideal_nops();
jump_label_init();
static_call_init();
early_ioremap_init();
@@ -861,6 +863,20 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
+ /*
+ * Do some memory reservations *before* memory is added to memblock, so
+ * memblock allocations won't overwrite it.
+ *
+ * After this point, everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not RAM in
+ * e820. All other memory is free game.
+ *
+ * This call needs to happen before e820__memory_setup() which calls the
+ * xen_memory_setup() on Xen dom0 which relies on the fact that those
+ * early reservations have happened already.
+ */
+ early_reserve_memory();
+
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
e820__memory_setup();
parse_setup_data();
@@ -869,10 +885,7 @@ void __init setup_arch(char **cmdline_p)
if (!boot_params.hdr.root_flags)
root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = _brk_end;
+ setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
@@ -885,26 +898,24 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
#endif
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
- * console setup can safely call set_fixmap()). It may then be called
- * again from within noexec_setup() during parsing early parameters
- * to honor the respective command line option.
+ * console setup can safely call set_fixmap()).
*/
x86_configure_nx();
@@ -912,6 +923,7 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -938,9 +950,6 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* after early param, so could get panic from serial */
- memblock_x86_reserve_range_setup_data();
-
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
disable_apic = 1;
@@ -992,7 +1001,11 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ if (IS_ENABLED(CONFIG_MTRR))
+ mtrr_bp_init();
+ else
+ pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
+
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
@@ -1032,14 +1045,12 @@ void __init setup_arch(char **cmdline_p)
*/
find_smp_config();
- reserve_ibft_region();
-
early_alloc_pgt_buf();
/*
* Need to conclude brk, before e820__memblock_setup()
- * it could use memblock_find_in_range, could overlap with
- * brk area.
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
*/
reserve_brk();
@@ -1054,8 +1065,6 @@ void __init setup_arch(char **cmdline_p)
*/
sev_setup_arch();
- reserve_bios_regions();
-
efi_fake_memmap();
efi_find_mirror();
efi_esrt_init();
@@ -1079,11 +1088,22 @@ void __init setup_arch(char **cmdline_p)
(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
+ /*
+ * Find free memory for the real mode trampoline and place it there. If
+ * there is not enough free memory under 1M, on EFI-enabled systems
+ * there will be additional attempt to reclaim the memory for the real
+ * mode trampoline at efi_free_boot_services().
+ *
+ * Unconditionally reserve the entire first 1M of RAM because BIOSes
+ * are known to corrupt low memory and several hundred kilobytes are not
+ * worth complex detection what memory gets clobbered. Windows does the
+ * same thing for very similar reasons.
+ *
+ * Moreover, on machines with SandyBridge graphics or in setups that use
+ * crashkernel the entire 1M is reserved anyway.
+ */
reserve_real_mode();
- trim_platform_memory_ranges();
- trim_low_memory_range();
-
init_mem_mapping();
idt_setup_early_pf();
@@ -1129,6 +1149,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
vsmp_init();
@@ -1136,11 +1158,6 @@ void __init setup_arch(char **cmdline_p)
early_platform_quirks();
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-
early_acpi_boot_init();
initmem_init();
@@ -1223,6 +1240,14 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fd945ce78554..49325caa7307 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
*/
static bool __init pcpu_need_numa(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
pg_data_t *last = NULL;
unsigned int cpu;
@@ -84,63 +84,9 @@ static bool __init pcpu_need_numa(void)
}
#endif
-/**
- * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
- * @cpu: cpu to allocate for
- * @size: size allocation in bytes
- * @align: alignment
- *
- * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
- * does the right thing for NUMA regardless of the current
- * configuration.
- *
- * RETURNS:
- * Pointer to the allocated area on success, NULL on failure.
- */
-static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
- unsigned long align)
-{
- const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int node = early_cpu_to_node(cpu);
- void *ptr;
-
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = memblock_alloc_from(size, align, goal);
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
- cpu, size, __pa(ptr));
- } else {
- ptr = memblock_alloc_try_nid(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- node);
-
- pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
- cpu, size, node, __pa(ptr));
- }
- return ptr;
-#else
- return memblock_alloc_from(size, align, goal);
-#endif
-}
-
-/*
- * Helpers for first chunk memory allocation
- */
-static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
-{
- return pcpu_alloc_bootmem(cpu, size, align);
-}
-
-static void __init pcpu_fc_free(void *ptr, size_t size)
-{
- memblock_free(__pa(ptr), size);
-}
-
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
if (early_cpu_to_node(from) == early_cpu_to_node(to))
return LOCAL_DISTANCE;
else
@@ -150,7 +96,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
#endif
}
-static void __init pcpup_populate_pte(unsigned long addr)
+static int __init pcpu_cpu_to_node(int cpu)
+{
+ return early_cpu_to_node(cpu);
+}
+
+void __init pcpu_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
@@ -205,15 +156,14 @@ void __init setup_per_cpu_areas(void)
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
- pcpu_fc_alloc, pcpu_fc_free);
+ pcpu_cpu_to_node);
if (rc < 0)
pr_warn("%s allocator failed (%d), falling back to page size\n",
pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
- pcpu_fc_alloc, pcpu_fc_free,
- pcpup_populate_pte);
+ pcpu_cpu_to_node);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
@@ -224,7 +174,6 @@ void __init setup_per_cpu_areas(void)
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
- setup_stack_canary_segment(cpu);
/*
* Copy data used in early init routines from the
* initial arrays to the per cpu data areas. These
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
deleted file mode 100644
index cdc04d091242..000000000000
--- a/arch/x86/kernel/sev-es-shared.c
+++ /dev/null
@@ -1,533 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * AMD Encrypted Register State Support
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- *
- * This file is not compiled stand-alone. It contains code shared
- * between the pre-decompression boot code and the running Linux kernel
- * and is included directly into both code-bases.
- */
-
-#ifndef __BOOT_COMPRESSED
-#define error(v) pr_err(v)
-#define has_cpuflag(f) boot_cpu_has(f)
-#endif
-
-static bool __init sev_es_check_cpu_features(void)
-{
- if (!has_cpuflag(X86_FEATURE_RDRAND)) {
- error("RDRAND instruction not supported - no trusted source of randomness available\n");
- return false;
- }
-
- return true;
-}
-
-static void sev_es_terminate(unsigned int reason)
-{
- u64 val = GHCB_SEV_TERMINATE;
-
- /*
- * Tell the hypervisor what went wrong - only reason-set 0 is
- * currently supported.
- */
- val |= GHCB_SEV_TERMINATE_REASON(0, reason);
-
- /* Request Guest Termination from Hypvervisor */
- sev_es_wr_ghcb_msr(val);
- VMGEXIT();
-
- while (true)
- asm volatile("hlt\n" : : : "memory");
-}
-
-static bool sev_es_negotiate_protocol(void)
-{
- u64 val;
-
- /* Do the GHCB protocol version negotiation */
- sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ);
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
-
- if (GHCB_INFO(val) != GHCB_SEV_INFO)
- return false;
-
- if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR ||
- GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR)
- return false;
-
- return true;
-}
-
-static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
-{
- memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-}
-
-static bool vc_decoding_needed(unsigned long exit_code)
-{
- /* Exceptions don't require to decode the instruction */
- return !(exit_code >= SVM_EXIT_EXCP_BASE &&
- exit_code <= SVM_EXIT_LAST_EXCP);
-}
-
-static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
- struct pt_regs *regs,
- unsigned long exit_code)
-{
- enum es_result ret = ES_OK;
-
- memset(ctxt, 0, sizeof(*ctxt));
- ctxt->regs = regs;
-
- if (vc_decoding_needed(exit_code))
- ret = vc_decode_insn(ctxt);
-
- return ret;
-}
-
-static void vc_finish_insn(struct es_em_ctxt *ctxt)
-{
- ctxt->regs->ip += ctxt->insn.length;
-}
-
-static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
-{
- enum es_result ret;
-
- /* Fill in protocol and format specifiers */
- ghcb->protocol_version = GHCB_PROTOCOL_MAX;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, exit_code);
- ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
- ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) {
- u64 info = ghcb->save.sw_exit_info_2;
- unsigned long v;
-
- info = ghcb->save.sw_exit_info_2;
- v = info & SVM_EVTINJ_VEC_MASK;
-
- /* Check if exception information from hypervisor is sane. */
- if ((info & SVM_EVTINJ_VALID) &&
- ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
- ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
- ctxt->fi.vector = v;
- if (info & SVM_EVTINJ_VALID_ERR)
- ctxt->fi.error_code = info >> 32;
- ret = ES_EXCEPTION;
- } else {
- ret = ES_VMM_ERROR;
- }
- } else {
- ret = ES_OK;
- }
-
- return ret;
-}
-
-/*
- * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
- * page yet, so it only supports the MSR based communication with the
- * hypervisor and only the CPUID exit-code.
- */
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
-{
- unsigned int fn = lower_bits(regs->ax, 32);
- unsigned long val;
-
- /* Only CPUID is supported via MSR protocol */
- if (exit_code != SVM_EXIT_CPUID)
- goto fail;
-
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
- goto fail;
- regs->ax = val >> 32;
-
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
- goto fail;
- regs->bx = val >> 32;
-
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
- goto fail;
- regs->cx = val >> 32;
-
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
- goto fail;
- regs->dx = val >> 32;
-
- /*
- * This is a VC handler and the #VC is only raised when SEV-ES is
- * active, which means SEV must be active too. Do sanity checks on the
- * CPUID results to make sure the hypervisor does not trick the kernel
- * into the no-sev path. This could map sensitive data unencrypted and
- * make it accessible to the hypervisor.
- *
- * In particular, check for:
- * - Hypervisor CPUID bit
- * - Availability of CPUID leaf 0x8000001f
- * - SEV CPUID bit.
- *
- * The hypervisor might still report the wrong C-bit position, but this
- * can't be checked here.
- */
-
- if ((fn == 1 && !(regs->cx & BIT(31))))
- /* Hypervisor bit */
- goto fail;
- else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
- /* SEV leaf check */
- goto fail;
- else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
- /* SEV bit */
- goto fail;
-
- /* Skip over the CPUID two-byte opcode */
- regs->ip += 2;
-
- return;
-
-fail:
- sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
- VMGEXIT();
-
- /* Shouldn't get here - if we do halt the machine */
- while (true)
- asm volatile("hlt\n");
-}
-
-static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
- void *src, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, b = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
-
- for (i = 0; i < count; i++) {
- void *s = src + (i * data_size * b);
- char *d = buf + (i * data_size);
-
- ret = vc_read_mem(ctxt, s, d, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
- void *dst, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, s = backwards ? -1 : 1;
- enum es_result ret = ES_OK;
-
- for (i = 0; i < count; i++) {
- void *d = dst + (i * data_size * s);
- char *b = buf + (i * data_size);
-
- ret = vc_write_mem(ctxt, d, b, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-#define IOIO_TYPE_STR BIT(2)
-#define IOIO_TYPE_IN 1
-#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
-#define IOIO_TYPE_OUT 0
-#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
-
-#define IOIO_REP BIT(3)
-
-#define IOIO_ADDR_64 BIT(9)
-#define IOIO_ADDR_32 BIT(8)
-#define IOIO_ADDR_16 BIT(7)
-
-#define IOIO_DATA_32 BIT(6)
-#define IOIO_DATA_16 BIT(5)
-#define IOIO_DATA_8 BIT(4)
-
-#define IOIO_SEG_ES (0 << 10)
-#define IOIO_SEG_DS (3 << 10)
-
-static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
-{
- struct insn *insn = &ctxt->insn;
- *exitinfo = 0;
-
- switch (insn->opcode.bytes[0]) {
- /* INS opcodes */
- case 0x6c:
- case 0x6d:
- *exitinfo |= IOIO_TYPE_INS;
- *exitinfo |= IOIO_SEG_ES;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
- break;
-
- /* OUTS opcodes */
- case 0x6e:
- case 0x6f:
- *exitinfo |= IOIO_TYPE_OUTS;
- *exitinfo |= IOIO_SEG_DS;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
- break;
-
- /* IN immediate opcodes */
- case 0xe4:
- case 0xe5:
- *exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (u8)insn->immediate.value << 16;
- break;
-
- /* OUT immediate opcodes */
- case 0xe6:
- case 0xe7:
- *exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (u8)insn->immediate.value << 16;
- break;
-
- /* IN register opcodes */
- case 0xec:
- case 0xed:
- *exitinfo |= IOIO_TYPE_IN;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
- break;
-
- /* OUT register opcodes */
- case 0xee:
- case 0xef:
- *exitinfo |= IOIO_TYPE_OUT;
- *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
- break;
-
- default:
- return ES_DECODE_FAILED;
- }
-
- switch (insn->opcode.bytes[0]) {
- case 0x6c:
- case 0x6e:
- case 0xe4:
- case 0xe6:
- case 0xec:
- case 0xee:
- /* Single byte opcodes */
- *exitinfo |= IOIO_DATA_8;
- break;
- default:
- /* Length determined by instruction parsing */
- *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
- : IOIO_DATA_32;
- }
- switch (insn->addr_bytes) {
- case 2:
- *exitinfo |= IOIO_ADDR_16;
- break;
- case 4:
- *exitinfo |= IOIO_ADDR_32;
- break;
- case 8:
- *exitinfo |= IOIO_ADDR_64;
- break;
- }
-
- if (insn_has_rep_prefix(insn))
- *exitinfo |= IOIO_REP;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u64 exit_info_1, exit_info_2;
- enum es_result ret;
-
- ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_STR) {
-
- /* (REP) INS/OUTS */
-
- bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
- unsigned int io_bytes, exit_bytes;
- unsigned int ghcb_count, op_count;
- unsigned long es_base;
- u64 sw_scratch;
-
- /*
- * For the string variants with rep prefix the amount of in/out
- * operations per #VC exception is limited so that the kernel
- * has a chance to take interrupts and re-schedule while the
- * instruction is emulated.
- */
- io_bytes = (exit_info_1 >> 4) & 0x7;
- ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
-
- op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
- exit_info_2 = min(op_count, ghcb_count);
- exit_bytes = exit_info_2 * io_bytes;
-
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- /* Read bytes of OUTS into the shared buffer */
- if (!(exit_info_1 & IOIO_TYPE_IN)) {
- ret = vc_insn_string_read(ctxt,
- (void *)(es_base + regs->si),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
- }
-
- /*
- * Issue an VMGEXIT to the HV to consume the bytes from the
- * shared buffer or to have it write them into the shared buffer
- * depending on the instruction: OUTS or INS.
- */
- sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
- ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
- exit_info_1, exit_info_2);
- if (ret != ES_OK)
- return ret;
-
- /* Read bytes from shared buffer into the guest's destination. */
- if (exit_info_1 & IOIO_TYPE_IN) {
- ret = vc_insn_string_write(ctxt,
- (void *)(es_base + regs->di),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
-
- if (df)
- regs->di -= exit_bytes;
- else
- regs->di += exit_bytes;
- } else {
- if (df)
- regs->si -= exit_bytes;
- else
- regs->si += exit_bytes;
- }
-
- if (exit_info_1 & IOIO_REP)
- regs->cx -= exit_info_2;
-
- ret = regs->cx ? ES_RETRY : ES_OK;
-
- } else {
-
- /* IN/OUT into/from rAX */
-
- int bits = (exit_info_1 & 0x70) >> 1;
- u64 rax = 0;
-
- if (!(exit_info_1 & IOIO_TYPE_IN))
- rax = lower_bits(regs->ax, bits);
-
- ghcb_set_rax(ghcb, rax);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_IN) {
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
- regs->ax = lower_bits(ghcb->save.rax, bits);
- }
- }
-
- return ret;
-}
-
-static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u32 cr4 = native_read_cr4();
- enum es_result ret;
-
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rcx(ghcb, regs->cx);
-
- if (cr4 & X86_CR4_OSXSAVE)
- /* Safe to read xcr0 */
- ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
- else
- /* xgetbv will cause #GP - use reset value for xcr0 */
- ghcb_set_xcr0(ghcb, 1);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) &&
- ghcb_rbx_is_valid(ghcb) &&
- ghcb_rcx_is_valid(ghcb) &&
- ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- regs->ax = ghcb->save.rax;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- unsigned long exit_code)
-{
- bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
- enum es_result ret;
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
- (!rdtscp || ghcb_rcx_is_valid(ghcb))))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
- if (rdtscp)
- ctxt->regs->cx = ghcb->save.rcx;
-
- return ES_OK;
-}
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
deleted file mode 100644
index 04a780abb512..000000000000
--- a/arch/x86/kernel/sev-es.c
+++ /dev/null
@@ -1,1434 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * AMD Memory Encryption Support
- *
- * Copyright (C) 2019 SUSE
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- */
-
-#define pr_fmt(fmt) "SEV-ES: " fmt
-
-#include <linux/sched/debug.h> /* For show_regs() */
-#include <linux/percpu-defs.h>
-#include <linux/mem_encrypt.h>
-#include <linux/lockdep.h>
-#include <linux/printk.h>
-#include <linux/mm_types.h>
-#include <linux/set_memory.h>
-#include <linux/memblock.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-
-#include <asm/cpu_entry_area.h>
-#include <asm/stacktrace.h>
-#include <asm/sev-es.h>
-#include <asm/insn-eval.h>
-#include <asm/fpu/internal.h>
-#include <asm/processor.h>
-#include <asm/realmode.h>
-#include <asm/traps.h>
-#include <asm/svm.h>
-#include <asm/smp.h>
-#include <asm/cpu.h>
-
-#define DR7_RESET_VALUE 0x400
-
-/* For early boot hypervisor communication in SEV-ES enabled guests */
-static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
-
-/*
- * Needs to be in the .data section because we need it NULL before bss is
- * cleared
- */
-static struct ghcb __initdata *boot_ghcb;
-
-/* #VC handler runtime per-CPU data */
-struct sev_es_runtime_data {
- struct ghcb ghcb_page;
-
- /* Physical storage for the per-CPU IST stack of the #VC handler */
- char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
- /*
- * Physical storage for the per-CPU fall-back stack of the #VC handler.
- * The fall-back stack is used when it is not safe to switch back to the
- * interrupted stack in the #VC entry code.
- */
- char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
- /*
- * Reserve one page per CPU as backup storage for the unencrypted GHCB.
- * It is needed when an NMI happens while the #VC handler uses the real
- * GHCB, and the NMI handler itself is causing another #VC exception. In
- * that case the GHCB content of the first handler needs to be backed up
- * and restored.
- */
- struct ghcb backup_ghcb;
-
- /*
- * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
- * There is no need for it to be atomic, because nothing is written to
- * the GHCB between the read and the write of ghcb_active. So it is safe
- * to use it when a nested #VC exception happens before the write.
- *
- * This is necessary for example in the #VC->NMI->#VC case when the NMI
- * happens while the first #VC handler uses the GHCB. When the NMI code
- * raises a second #VC handler it might overwrite the contents of the
- * GHCB written by the first handler. To avoid this the content of the
- * GHCB is saved and restored when the GHCB is detected to be in use
- * already.
- */
- bool ghcb_active;
- bool backup_ghcb_active;
-
- /*
- * Cached DR7 value - write it on DR7 writes and return it on reads.
- * That value will never make it to the real hardware DR7 as debugging
- * is currently unsupported in SEV-ES guests.
- */
- unsigned long dr7;
-};
-
-struct ghcb_state {
- struct ghcb *ghcb;
-};
-
-static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
-DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
-
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
-static void __init setup_vc_stacks(int cpu)
-{
- struct sev_es_runtime_data *data;
- struct cpu_entry_area *cea;
- unsigned long vaddr;
- phys_addr_t pa;
-
- data = per_cpu(runtime_data, cpu);
- cea = get_cpu_entry_area(cpu);
-
- /* Map #VC IST stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
- pa = __pa(data->ist_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-
- /* Map VC fall-back stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
- pa = __pa(data->fallback_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-}
-
-static __always_inline bool on_vc_stack(struct pt_regs *regs)
-{
- unsigned long sp = regs->sp;
-
- /* User-mode RSP is not trusted */
- if (user_mode(regs))
- return false;
-
- /* SYSCALL gap still has user-mode RSP */
- if (ip_within_syscall_gap(regs))
- return false;
-
- return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
-}
-
-/*
- * This function handles the case when an NMI is raised in the #VC exception
- * handler entry code. In this case, the IST entry for #VC must be adjusted, so
- * that any subsequent #VC exception will not overwrite the stack contents of the
- * interrupted #VC handler.
- *
- * The IST entry is adjusted unconditionally so that it can be also be
- * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
- * sev_es_ist_exit() call may adjust back the IST entry too early.
- */
-void noinstr __sev_es_ist_enter(struct pt_regs *regs)
-{
- unsigned long old_ist, new_ist;
-
- /* Read old IST entry */
- old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
-
- /* Make room on the IST stack */
- if (on_vc_stack(regs))
- new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
- else
- new_ist = old_ist - sizeof(old_ist);
-
- /* Store old IST entry */
- *(unsigned long *)new_ist = old_ist;
-
- /* Set new IST entry */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
-}
-
-void noinstr __sev_es_ist_exit(void)
-{
- unsigned long ist;
-
- /* Read IST entry */
- ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
-
- if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
- return;
-
- /* Read back old IST entry and write it to the TSS */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
-}
-
-static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (unlikely(data->ghcb_active)) {
- /* GHCB is already in use - save its contents */
-
- if (unlikely(data->backup_ghcb_active))
- return NULL;
-
- /* Mark backup_ghcb active before writing to it */
- data->backup_ghcb_active = true;
-
- state->ghcb = &data->backup_ghcb;
-
- /* Backup GHCB content */
- *state->ghcb = *ghcb;
- } else {
- state->ghcb = NULL;
- data->ghcb_active = true;
- }
-
- return ghcb;
-}
-
-static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- data->ghcb_active = false;
- }
-}
-
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
-}
-
-static __always_inline void sev_es_wr_ghcb_msr(u64 val)
-{
- u32 low, high;
-
- low = (u32)(val);
- high = (u32)(val >> 32);
-
- native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
-}
-
-static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
- unsigned char *buffer)
-{
- return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
-}
-
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- enum es_result ret;
- int res;
-
- if (user_mode(ctxt->regs)) {
- res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (!res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
-
- if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res))
- return ES_DECODE_FAILED;
- } else {
- res = vc_fetch_insn_kernel(ctxt, buffer);
- if (res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
-
- insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
- insn_get_length(&ctxt->insn);
- }
-
- ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
-
- return ret;
-}
-
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- char *dst, char *buf, size_t size)
-{
- unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
-
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
- memcpy(dst, buf, size);
- return ES_OK;
- }
-
- switch (size) {
- case 1:
- memcpy(&d1, buf, 1);
- if (put_user(d1, target))
- goto fault;
- break;
- case 2:
- memcpy(&d2, buf, 2);
- if (put_user(d2, target))
- goto fault;
- break;
- case 4:
- memcpy(&d4, buf, 4);
- if (put_user(d4, target))
- goto fault;
- break;
- case 8:
- memcpy(&d8, buf, 8);
- if (put_user(d8, target))
- goto fault;
- break;
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
- }
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
-
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)dst;
-
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- char *src, char *buf, size_t size)
-{
- unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
-
- /* If instruction ran in kernel mode and the I/O buffer is in kernel space */
- if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
- memcpy(buf, src, size);
- return ES_OK;
- }
-
- switch (size) {
- case 1:
- if (get_user(d1, s))
- goto fault;
- memcpy(buf, &d1, 1);
- break;
- case 2:
- if (get_user(d2, s))
- goto fault;
- memcpy(buf, &d2, 2);
- break;
- case 4:
- if (get_user(d4, s))
- goto fault;
- memcpy(buf, &d4, 4);
- break;
- case 8:
- if (get_user(d8, s))
- goto fault;
- memcpy(buf, &d8, 8);
- break;
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
- }
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
-
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)src;
-
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
-{
- unsigned long va = (unsigned long)vaddr;
- unsigned int level;
- phys_addr_t pa;
- pgd_t *pgd;
- pte_t *pte;
-
- pgd = __va(read_cr3_pa());
- pgd = &pgd[pgd_index(va)];
- pte = lookup_address_in_pgd(pgd, va, &level);
- if (!pte) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.cr2 = vaddr;
- ctxt->fi.error_code = 0;
-
- if (user_mode(ctxt->regs))
- ctxt->fi.error_code |= X86_PF_USER;
-
- return ES_EXCEPTION;
- }
-
- if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
- /* Emulated MMIO to/from encrypted memory not supported */
- return ES_UNSUPPORTED;
-
- pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
- pa |= va & ~page_level_mask(level);
-
- *paddr = pa;
-
- return ES_OK;
-}
-
-/* Include code shared with pre-decompression boot stage */
-#include "sev-es-shared.c"
-
-void noinstr __sev_es_nmi_complete(void)
-{
- struct ghcb_state state;
- struct ghcb *ghcb;
-
- ghcb = sev_es_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
- VMGEXIT();
-
- sev_es_put_ghcb(&state);
-}
-
-static u64 get_jump_table_addr(void)
-{
- struct ghcb_state state;
- unsigned long flags;
- struct ghcb *ghcb;
- u64 ret = 0;
-
- local_irq_save(flags);
-
- ghcb = sev_es_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
- ghcb_sw_exit_info_2_is_valid(ghcb))
- ret = ghcb->save.sw_exit_info_2;
-
- sev_es_put_ghcb(&state);
-
- local_irq_restore(flags);
-
- return ret;
-}
-
-int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
-{
- u16 startup_cs, startup_ip;
- phys_addr_t jump_table_pa;
- u64 jump_table_addr;
- u16 __iomem *jump_table;
-
- jump_table_addr = get_jump_table_addr();
-
- /* On UP guests there is no jump table so this is not a failure */
- if (!jump_table_addr)
- return 0;
-
- /* Check if AP Jump Table is page-aligned */
- if (jump_table_addr & ~PAGE_MASK)
- return -EINVAL;
-
- jump_table_pa = jump_table_addr & PAGE_MASK;
-
- startup_cs = (u16)(rmh->trampoline_start >> 4);
- startup_ip = (u16)(rmh->sev_es_trampoline_start -
- rmh->trampoline_start);
-
- jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
- if (!jump_table)
- return -EIO;
-
- writew(startup_ip, &jump_table[0]);
- writew(startup_cs, &jump_table[1]);
-
- iounmap(jump_table);
-
- return 0;
-}
-
-/*
- * This is needed by the OVMF UEFI firmware which will use whatever it finds in
- * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
- * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
- */
-int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
-{
- struct sev_es_runtime_data *data;
- unsigned long address, pflags;
- int cpu;
- u64 pfn;
-
- if (!sev_es_active())
- return 0;
-
- pflags = _PAGE_NX | _PAGE_RW;
-
- for_each_possible_cpu(cpu) {
- data = per_cpu(runtime_data, cpu);
-
- address = __pa(&data->ghcb_page);
- pfn = address >> PAGE_SHIFT;
-
- if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
- return 1;
- }
-
- return 0;
-}
-
-static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- enum es_result ret;
- u64 exit_info_1;
-
- /* Is it a WRMSR? */
- exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
-
- ghcb_set_rcx(ghcb, regs->cx);
- if (exit_info_1) {
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rdx(ghcb, regs->dx);
- }
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
-
- if ((ret == ES_OK) && (!exit_info_1)) {
- regs->ax = ghcb->save.rax;
- regs->dx = ghcb->save.rdx;
- }
-
- return ret;
-}
-
-/*
- * This function runs on the first #VC exception after the kernel
- * switched to virtual addresses.
- */
-static bool __init sev_es_setup_ghcb(void)
-{
- /* First make sure the hypervisor talks a supported protocol. */
- if (!sev_es_negotiate_protocol())
- return false;
-
- /*
- * Clear the boot_ghcb. The first exception comes in before the bss
- * section is cleared.
- */
- memset(&boot_ghcb_page, 0, PAGE_SIZE);
-
- /* Alright - Make the boot-ghcb public */
- boot_ghcb = &boot_ghcb_page;
-
- return true;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void sev_es_ap_hlt_loop(void)
-{
- struct ghcb_state state;
- struct ghcb *ghcb;
-
- ghcb = sev_es_get_ghcb(&state);
-
- while (true) {
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- /* Wakeup signal? */
- if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
- ghcb->save.sw_exit_info_2)
- break;
- }
-
- sev_es_put_ghcb(&state);
-}
-
-/*
- * Play_dead handler when running under SEV-ES. This is needed because
- * the hypervisor can't deliver an SIPI request to restart the AP.
- * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
- * hypervisor wakes it up again.
- */
-static void sev_es_play_dead(void)
-{
- play_dead_common();
-
- /* IRQs now disabled */
-
- sev_es_ap_hlt_loop();
-
- /*
- * If we get here, the VCPU was woken up again. Jump to CPU
- * startup code to get it back online.
- */
- start_cpu0();
-}
-#else /* CONFIG_HOTPLUG_CPU */
-#define sev_es_play_dead native_play_dead
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#ifdef CONFIG_SMP
-static void __init sev_es_setup_play_dead(void)
-{
- smp_ops.play_dead = sev_es_play_dead;
-}
-#else
-static inline void sev_es_setup_play_dead(void) { }
-#endif
-
-static void __init alloc_runtime_data(int cpu)
-{
- struct sev_es_runtime_data *data;
-
- data = memblock_alloc(sizeof(*data), PAGE_SIZE);
- if (!data)
- panic("Can't allocate SEV-ES runtime data");
-
- per_cpu(runtime_data, cpu) = data;
-}
-
-static void __init init_ghcb(int cpu)
-{
- struct sev_es_runtime_data *data;
- int err;
-
- data = per_cpu(runtime_data, cpu);
-
- err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
- sizeof(data->ghcb_page));
- if (err)
- panic("Can't map GHCBs unencrypted");
-
- memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
-
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-}
-
-void __init sev_es_init_vc_handling(void)
-{
- int cpu;
-
- BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
-
- if (!sev_es_active())
- return;
-
- if (!sev_es_check_cpu_features())
- panic("SEV-ES CPU Features missing");
-
- /* Enable SEV-ES special handling */
- static_branch_enable(&sev_es_enable_key);
-
- /* Initialize per-cpu GHCB pages */
- for_each_possible_cpu(cpu) {
- alloc_runtime_data(cpu);
- init_ghcb(cpu);
- setup_vc_stacks(cpu);
- }
-
- sev_es_setup_play_dead();
-
- /* Secondary CPUs use the runtime #VC handler */
- initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
-}
-
-static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
-{
- int trapnr = ctxt->fi.vector;
-
- if (trapnr == X86_TRAP_PF)
- native_write_cr2(ctxt->fi.cr2);
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
- do_early_exception(ctxt->regs, trapnr);
-}
-
-static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-
-static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned int bytes, bool read)
-{
- u64 exit_code, exit_info_1, exit_info_2;
- unsigned long ghcb_pa = __pa(ghcb);
- enum es_result res;
- phys_addr_t paddr;
- void __user *ref;
-
- ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
- if (ref == (void __user *)-1L)
- return ES_UNSUPPORTED;
-
- exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
-
- res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
- if (res != ES_OK) {
- if (res == ES_EXCEPTION && !read)
- ctxt->fi.error_code |= X86_PF_WRITE;
-
- return res;
- }
-
- exit_info_1 = paddr;
- /* Can never be greater than 8 */
- exit_info_2 = bytes;
-
- ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
-
- return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
-}
-
-static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- unsigned int bytes = 0;
- enum es_result ret;
- int sign_byte;
- long *reg_data;
-
- switch (insn->opcode.bytes[1]) {
- /* MMIO Read w/ zero-extension */
- case 0xb6:
- bytes = 1;
- fallthrough;
- case 0xb7:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- memset(reg_data, 0, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- /* MMIO Read w/ sign-extension */
- case 0xbe:
- bytes = 1;
- fallthrough;
- case 0xbf:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Sign extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
- memset(reg_data, sign_byte, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- default:
- ret = ES_UNSUPPORTED;
- }
-
- return ret;
-}
-
-/*
- * The MOVS instruction has two memory operands, which raises the
- * problem that it is not known whether the access to the source or the
- * destination caused the #VC exception (and hence whether an MMIO read
- * or write operation needs to be emulated).
- *
- * Instead of playing games with walking page-tables and trying to guess
- * whether the source or destination is an MMIO range, split the move
- * into two operations, a read and a write with only one memory operand.
- * This will cause a nested #VC exception on the MMIO address which can
- * then be handled.
- *
- * This implementation has the benefit that it also supports MOVS where
- * source _and_ destination are MMIO regions.
- *
- * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
- * rare operation. If it turns out to be a performance problem the split
- * operations can be moved to memcpy_fromio() and memcpy_toio().
- */
-static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
- unsigned int bytes)
-{
- unsigned long ds_base, es_base;
- unsigned char *src, *dst;
- unsigned char buffer[8];
- enum es_result ret;
- bool rep;
- int off;
-
- ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- if (ds_base == -1L || es_base == -1L) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- src = ds_base + (unsigned char *)ctxt->regs->si;
- dst = es_base + (unsigned char *)ctxt->regs->di;
-
- ret = vc_read_mem(ctxt, src, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- ret = vc_write_mem(ctxt, dst, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- if (ctxt->regs->flags & X86_EFLAGS_DF)
- off = -bytes;
- else
- off = bytes;
-
- ctxt->regs->si += off;
- ctxt->regs->di += off;
-
- rep = insn_has_rep_prefix(&ctxt->insn);
- if (rep)
- ctxt->regs->cx -= 1;
-
- if (!rep || ctxt->regs->cx == 0)
- return ES_OK;
- else
- return ES_RETRY;
-}
-
-static enum es_result vc_handle_mmio(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- unsigned int bytes = 0;
- enum es_result ret;
- long *reg_data;
-
- switch (insn->opcode.bytes[0]) {
- /* MMIO Write */
- case 0x88:
- bytes = 1;
- fallthrough;
- case 0x89:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- memcpy(ghcb->shared_buffer, reg_data, bytes);
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
-
- case 0xc6:
- bytes = 1;
- fallthrough;
- case 0xc7:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
- memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
-
- /* MMIO Read */
- case 0x8a:
- bytes = 1;
- fallthrough;
- case 0x8b:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- /* Zero-extend for 32-bit operation */
- if (bytes == 4)
- *reg_data = 0;
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- /* MOVS instruction */
- case 0xa4:
- bytes = 1;
- fallthrough;
- case 0xa5:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
- ret = vc_handle_mmio_movs(ctxt, bytes);
- break;
- /* Two-Byte Opcodes */
- case 0x0f:
- ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
- break;
- default:
- ret = ES_UNSUPPORTED;
- }
-
- return ret;
-}
-
-static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long val, *reg = vc_insn_get_rm(ctxt);
- enum es_result ret;
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- val = *reg;
-
- /* Upper 32 bits must be written as zeroes */
- if (val >> 32) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- /* Clear out other reserved bits and set bit 10 */
- val = (val & 0xffff23ffL) | BIT(10);
-
- /* Early non-zero writes to DR7 are not supported */
- if (!data && (val & ~DR7_RESET_VALUE))
- return ES_UNSUPPORTED;
-
- /* Using a value of 0 for ExitInfo1 means RAX holds the value */
- ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (data)
- data->dr7 = val;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long *reg = vc_insn_get_rm(ctxt);
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- if (data)
- *reg = data->dr7;
- else
- *reg = DR7_RESET_VALUE;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
-}
-
-static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rcx(ghcb, ctxt->regs->cx);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_monitor(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
- * Treat it as a NOP and do not leak a physical address to the
- * hypervisor.
- */
- return ES_OK;
-}
-
-static enum es_result vc_handle_mwait(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /* Treat the same as MONITOR/MONITORX */
- return ES_OK;
-}
-
-static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rax(ghcb, ctxt->regs->ax);
- ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
-
- if (x86_platform.hyper.sev_es_hcall_prepare)
- x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
-
- /*
- * Call sev_es_hcall_finish() after regs->ax is already set.
- * This allows the hypervisor handler to overwrite it again if
- * necessary.
- */
- if (x86_platform.hyper.sev_es_hcall_finish &&
- !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
- return ES_VMM_ERROR;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
- * Calling ecx_alignment_check() directly does not work, because it
- * enables IRQs and the GHCB is active. Forward the exception and call
- * it later from vc_forward_exception().
- */
- ctxt->fi.vector = X86_TRAP_AC;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
-}
-
-static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
-{
- if (user_mode(regs))
- noist_exc_debug(regs);
- else
- exc_debug(regs);
-}
-
-static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
- struct ghcb *ghcb,
- unsigned long exit_code)
-{
- enum es_result result;
-
- switch (exit_code) {
- case SVM_EXIT_READ_DR7:
- result = vc_handle_dr7_read(ghcb, ctxt);
- break;
- case SVM_EXIT_WRITE_DR7:
- result = vc_handle_dr7_write(ghcb, ctxt);
- break;
- case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
- result = vc_handle_trap_ac(ghcb, ctxt);
- break;
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
- break;
- case SVM_EXIT_RDPMC:
- result = vc_handle_rdpmc(ghcb, ctxt);
- break;
- case SVM_EXIT_INVD:
- pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
- result = ES_UNSUPPORTED;
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(ghcb, ctxt);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(ghcb, ctxt);
- break;
- case SVM_EXIT_MSR:
- result = vc_handle_msr(ghcb, ctxt);
- break;
- case SVM_EXIT_VMMCALL:
- result = vc_handle_vmmcall(ghcb, ctxt);
- break;
- case SVM_EXIT_WBINVD:
- result = vc_handle_wbinvd(ghcb, ctxt);
- break;
- case SVM_EXIT_MONITOR:
- result = vc_handle_monitor(ghcb, ctxt);
- break;
- case SVM_EXIT_MWAIT:
- result = vc_handle_mwait(ghcb, ctxt);
- break;
- case SVM_EXIT_NPF:
- result = vc_handle_mmio(ghcb, ctxt);
- break;
- default:
- /*
- * Unexpected #VC exception
- */
- result = ES_UNSUPPORTED;
- }
-
- return result;
-}
-
-static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
-{
- long error_code = ctxt->fi.error_code;
- int trapnr = ctxt->fi.vector;
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
-
- switch (trapnr) {
- case X86_TRAP_GP:
- exc_general_protection(ctxt->regs, error_code);
- break;
- case X86_TRAP_UD:
- exc_invalid_op(ctxt->regs);
- break;
- case X86_TRAP_AC:
- exc_alignment_check(ctxt->regs, error_code);
- break;
- default:
- pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
- BUG();
- }
-}
-
-static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
-{
- unsigned long sp = (unsigned long)regs;
-
- return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
-}
-
-/*
- * Main #VC exception handler. It is called when the entry code was able to
- * switch off the IST to a safe kernel stack.
- *
- * With the current implementation it is always possible to switch to a safe
- * stack because #VC exceptions only happen at known places, like intercepted
- * instructions or accesses to MMIO areas/IO ports. They can also happen with
- * code instrumentation when the hypervisor intercepts #DB, but the critical
- * paths are forbidden to be instrumented, so #DB exceptions currently also
- * only happen in safe places.
- */
-DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- irqentry_state_t irq_state;
- struct ghcb_state state;
- struct es_em_ctxt ctxt;
- enum es_result result;
- struct ghcb *ghcb;
-
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
- vc_handle_trap_db(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
-
- /*
- * This is invoked through an interrupt gate, so IRQs are disabled. The
- * code below might walk page-tables for user or kernel addresses, so
- * keep the IRQs disabled to protect us against concurrent TLB flushes.
- */
-
- ghcb = sev_es_get_ghcb(&state);
- if (!ghcb) {
- /*
- * Mark GHCBs inactive so that panic() is able to print the
- * message.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- }
-
- vc_ghcb_invalidate(ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, error_code);
-
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, ghcb, error_code);
-
- sev_es_put_ghcb(&state);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
- error_code, regs->ip);
- goto fail;
- case ES_VMM_ERROR:
- pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- goto fail;
- case ES_DECODE_FAILED:
- pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- goto fail;
- case ES_EXCEPTION:
- vc_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- pr_emerg("Unknown result in %s():%d\n", __func__, result);
- /*
- * Emulating the instruction which caused the #VC exception
- * failed - can't continue so print debug information
- */
- BUG();
- }
-
-out:
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
-
- return;
-
-fail:
- if (user_mode(regs)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- } else {
- pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
- result);
-
- /* Show some debug info */
- show_regs(regs);
-
- /* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
-
- /* If that fails and we get here - just panic */
- panic("Returned from Terminate-Request to Hypervisor\n");
- }
-
- goto out;
-}
-
-/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
-DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
-{
- instrumentation_begin();
- panic("Can't handle #VC exception from unsupported context\n");
- instrumentation_end();
-}
-
-DEFINE_IDTENTRY_VC(exc_vmm_communication)
-{
- if (likely(!on_vc_fallback_stack(regs)))
- safe_stack_exc_vmm_communication(regs, error_code);
- else
- ist_exc_vmm_communication(regs, error_code);
-}
-
-bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
-{
- unsigned long exit_code = regs->orig_ax;
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- /* Do initial setup or terminate the guest */
- if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
-
- vc_ghcb_invalidate(boot_ghcb);
-
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_VMM_ERROR:
- early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_DECODE_FAILED:
- early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_EXCEPTION:
- vc_early_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- BUG();
- }
-
- return true;
-
-fail:
- show_regs(regs);
-
- while (true)
- halt();
-}
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
new file mode 100644
index 000000000000..b478edf43bec
--- /dev/null
+++ b/arch/x86/kernel/sev-shared.c
@@ -0,0 +1,1000 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This file is not compiled stand-alone. It contains code shared
+ * between the pre-decompression boot code and the running Linux kernel
+ * and is included directly into both code-bases.
+ */
+
+#ifndef __BOOT_COMPRESSED
+#define error(v) pr_err(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#endif
+
+/* I/O parameters for CPUID-related helpers */
+struct cpuid_leaf {
+ u32 fn;
+ u32 subfn;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+/*
+ * Individual entries of the SNP CPUID table, as defined by the SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
+ */
+struct snp_cpuid_fn {
+ u32 eax_in;
+ u32 ecx_in;
+ u64 xcr0_in;
+ u64 xss_in;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u64 __reserved;
+} __packed;
+
+/*
+ * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
+ * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
+ * of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_table {
+ u32 count;
+ u32 __reserved1;
+ u64 __reserved2;
+ struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
+/*
+ * Since feature negotiation related variables are set early in the boot
+ * process they must reside in the .data section so as not to be zeroed
+ * out when the .bss section is later cleared.
+ *
+ * GHCB protocol version negotiated with the hypervisor.
+ */
+static u16 ghcb_version __ro_after_init;
+
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+static u32 cpuid_std_range_max __ro_after_init;
+static u32 cpuid_hyp_range_max __ro_after_init;
+static u32 cpuid_ext_range_max __ro_after_init;
+
+static bool __init sev_es_check_cpu_features(void)
+{
+ if (!has_cpuflag(X86_FEATURE_RDRAND)) {
+ error("RDRAND instruction not supported - no trusted source of randomness available\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypvervisor */
+ sev_es_wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+/*
+ * The hypervisor features are available from GHCB version 2 onward.
+ */
+static u64 get_hv_features(void)
+{
+ u64 val;
+
+ if (ghcb_version < 2)
+ return 0;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ);
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP)
+ return 0;
+
+ return GHCB_MSR_HV_FT_RESP_VAL(val);
+}
+
+static void snp_register_ghcb_early(unsigned long paddr)
+{
+ unsigned long pfn = paddr >> PAGE_SHIFT;
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ /* If the response GPA is not ours then abort the guest */
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
+ (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
+}
+
+static bool sev_es_negotiate_protocol(void)
+{
+ u64 val;
+
+ /* Do the GHCB protocol version negotiation */
+ sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
+
+ return true;
+}
+
+static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
+{
+ ghcb->save.sw_exit_code = 0;
+ __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static bool vc_decoding_needed(unsigned long exit_code)
+{
+ /* Exceptions don't require to decode the instruction */
+ return !(exit_code >= SVM_EXIT_EXCP_BASE &&
+ exit_code <= SVM_EXIT_LAST_EXCP);
+}
+
+static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
+ struct pt_regs *regs,
+ unsigned long exit_code)
+{
+ enum es_result ret = ES_OK;
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->regs = regs;
+
+ if (vc_decoding_needed(exit_code))
+ ret = vc_decode_insn(ctxt);
+
+ return ret;
+}
+
+static void vc_finish_insn(struct es_em_ctxt *ctxt)
+{
+ ctxt->regs->ip += ctxt->insn.length;
+}
+
+static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ u32 ret;
+
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
+
+ if (ret == 1) {
+ u64 info = ghcb->save.sw_exit_info_2;
+ unsigned long v = info & SVM_EVTINJ_VEC_MASK;
+
+ /* Check if exception information from hypervisor is sane. */
+ if ((info & SVM_EVTINJ_VALID) &&
+ ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
+ ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
+ ctxt->fi.vector = v;
+
+ if (info & SVM_EVTINJ_VALID_ERR)
+ ctxt->fi.error_code = info >> 32;
+
+ return ES_EXCEPTION;
+ }
+ }
+
+ return ES_VMM_ERROR;
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
+ struct es_em_ctxt *ctxt, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ /*
+ * Hyper-V unenlightened guests use a paravisor for communicating and
+ * GHCB pages are being allocated and set up by that paravisor. Linux
+ * should not change the GHCB page's physical address.
+ */
+ if (set_ghcb_msr)
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
+}
+
+static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
+{
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ return -EIO;
+
+ *reg = (val >> 32);
+
+ return 0;
+}
+
+static int sev_cpuid_hv(struct cpuid_leaf *leaf)
+{
+ int ret;
+
+ /*
+ * MSR protocol does not support fetching non-zero subfunctions, but is
+ * sufficient to handle current early-boot cases. Should that change,
+ * make sure to report an error rather than ignoring the index and
+ * grabbing random values. If this issue arises in the future, handling
+ * can be added here to use GHCB-page protocol for cases that occur late
+ * enough in boot that GHCB page is available.
+ */
+ if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn)
+ return -EINVAL;
+
+ ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx);
+
+ return ret;
+}
+
+/*
+ * This may be called early while still running on the initial identity
+ * mapping. Use RIP-relative addressing to obtain the correct address
+ * while running with the initial identity mapping as well as the
+ * switch-over to kernel virtual addresses later.
+ */
+static const struct snp_cpuid_table *snp_cpuid_get_table(void)
+{
+ void *ptr;
+
+ asm ("lea cpuid_table_copy(%%rip), %0"
+ : "=r" (ptr)
+ : "p" (&cpuid_table_copy));
+
+ return ptr;
+}
+
+/*
+ * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
+ * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
+ * and 1 based on the corresponding features enabled by a particular
+ * combination of XCR0 and XSS registers so that a guest can look up the
+ * version corresponding to the features currently enabled in its XCR0/XSS
+ * registers. The only values that differ between these versions/table
+ * entries is the enabled XSAVE area size advertised via EBX.
+ *
+ * While hypervisors may choose to make use of this support, it is more
+ * robust/secure for a guest to simply find the entry corresponding to the
+ * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
+ * XSAVE area size using subfunctions 2 through 64, as documented in APM
+ * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
+ *
+ * Since base/legacy XSAVE area size is documented as 0x240, use that value
+ * directly rather than relying on the base size in the CPUID table.
+ *
+ * Return: XSAVE area size on success, 0 otherwise.
+ */
+static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ u64 xfeatures_found = 0;
+ u32 xsave_size = 0x240;
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
+ continue;
+ if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
+ continue;
+ if (xfeatures_found & (BIT_ULL(e->ecx_in)))
+ continue;
+
+ xfeatures_found |= (BIT_ULL(e->ecx_in));
+
+ if (compacted)
+ xsave_size += e->eax;
+ else
+ xsave_size = max(xsave_size, e->eax + e->ebx);
+ }
+
+ /*
+ * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+ * entries in the CPUID table were not present. This is not a valid
+ * state to be in.
+ */
+ if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+ return 0;
+
+ return xsave_size;
+}
+
+static bool
+snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (e->eax_in != leaf->fn)
+ continue;
+
+ if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
+ continue;
+
+ /*
+ * For 0xD subfunctions 0 and 1, only use the entry corresponding
+ * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
+ * See the comments above snp_cpuid_calc_xsave_size() for more
+ * details.
+ */
+ if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
+ if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
+ continue;
+
+ leaf->eax = e->eax;
+ leaf->ebx = e->ebx;
+ leaf->ecx = e->ecx;
+ leaf->edx = e->edx;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+{
+ if (sev_cpuid_hv(leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+{
+ struct cpuid_leaf leaf_hv = *leaf;
+
+ switch (leaf->fn) {
+ case 0x1:
+ snp_cpuid_hv(&leaf_hv);
+
+ /* initial APIC ID */
+ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
+ /* APIC enabled bit */
+ leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
+
+ /* OSXSAVE enabled bit */
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ leaf->ecx |= BIT(27);
+ break;
+ case 0x7:
+ /* OSPKE enabled bit */
+ leaf->ecx &= ~BIT(4);
+ if (native_read_cr4() & X86_CR4_PKE)
+ leaf->ecx |= BIT(4);
+ break;
+ case 0xB:
+ leaf_hv.subfn = 0;
+ snp_cpuid_hv(&leaf_hv);
+
+ /* extended APIC ID */
+ leaf->edx = leaf_hv.edx;
+ break;
+ case 0xD: {
+ bool compacted = false;
+ u64 xcr0 = 1, xss = 0;
+ u32 xsave_size;
+
+ if (leaf->subfn != 0 && leaf->subfn != 1)
+ return 0;
+
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ if (leaf->subfn == 1) {
+ /* Get XSS value if XSAVES is enabled. */
+ if (leaf->eax & BIT(3)) {
+ unsigned long lo, hi;
+
+ asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+ : "c" (MSR_IA32_XSS));
+ xss = (hi << 32) | lo;
+ }
+
+ /*
+ * The PPR and APM aren't clear on what size should be
+ * encoded in 0xD:0x1:EBX when compaction is not enabled
+ * by either XSAVEC (feature bit 1) or XSAVES (feature
+ * bit 3) since SNP-capable hardware has these feature
+ * bits fixed as 1. KVM sets it to 0 in this case, but
+ * to avoid this becoming an issue it's safer to simply
+ * treat this as unsupported for SNP guests.
+ */
+ if (!(leaf->eax & (BIT(1) | BIT(3))))
+ return -EINVAL;
+
+ compacted = true;
+ }
+
+ xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
+ if (!xsave_size)
+ return -EINVAL;
+
+ leaf->ebx = xsave_size;
+ }
+ break;
+ case 0x8000001E:
+ snp_cpuid_hv(&leaf_hv);
+
+ /* extended APIC ID */
+ leaf->eax = leaf_hv.eax;
+ /* compute ID */
+ leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
+ /* node ID */
+ leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
+ break;
+ default:
+ /* No fix-ups needed, use values as-is. */
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
+ * should be treated as fatal by caller.
+ */
+static int snp_cpuid(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return -EOPNOTSUPP;
+
+ if (!snp_cpuid_get_validated_func(leaf)) {
+ /*
+ * Some hypervisors will avoid keeping track of CPUID entries
+ * where all values are zero, since they can be handled the
+ * same as out-of-range values (all-zero). This is useful here
+ * as well as it allows virtually all guest configurations to
+ * work using a single SNP CPUID table.
+ *
+ * To allow for this, there is a need to distinguish between
+ * out-of-range entries and in-range zero entries, since the
+ * CPUID table entries are only a template that may need to be
+ * augmented with additional values for things like
+ * CPU-specific information during post-processing. So if it's
+ * not in the table, set the values to zero. Then, if they are
+ * within a valid CPUID range, proceed with post-processing
+ * using zeros as the initial values. Otherwise, skip
+ * post-processing and just return zeros immediately.
+ */
+ leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+ /* Skip post-processing for out-of-range zero leafs. */
+ if (!(leaf->fn <= cpuid_std_range_max ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ return 0;
+ }
+
+ return snp_cpuid_postprocess(leaf);
+}
+
+/*
+ * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
+ * page yet, so it only supports the MSR based communication with the
+ * hypervisor and only the CPUID exit-code.
+ */
+void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+{
+ unsigned int subfn = lower_bits(regs->cx, 32);
+ unsigned int fn = lower_bits(regs->ax, 32);
+ struct cpuid_leaf leaf;
+ int ret;
+
+ /* Only CPUID is supported via MSR protocol */
+ if (exit_code != SVM_EXIT_CPUID)
+ goto fail;
+
+ leaf.fn = fn;
+ leaf.subfn = subfn;
+
+ ret = snp_cpuid(&leaf);
+ if (!ret)
+ goto cpuid_done;
+
+ if (ret != -EOPNOTSUPP)
+ goto fail;
+
+ if (sev_cpuid_hv(&leaf))
+ goto fail;
+
+cpuid_done:
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
+ /* Skip over the CPUID two-byte opcode */
+ regs->ip += 2;
+
+ return;
+
+fail:
+ /* Terminate the guest */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
+ void *src, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, b = backwards ? -1 : 1;
+ enum es_result ret = ES_OK;
+
+ for (i = 0; i < count; i++) {
+ void *s = src + (i * data_size * b);
+ char *d = buf + (i * data_size);
+
+ ret = vc_read_mem(ctxt, s, d, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
+ void *dst, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, s = backwards ? -1 : 1;
+ enum es_result ret = ES_OK;
+
+ for (i = 0; i < count; i++) {
+ void *d = dst + (i * data_size * s);
+ char *b = buf + (i * data_size);
+
+ ret = vc_write_mem(ctxt, d, b, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+#define IOIO_TYPE_STR BIT(2)
+#define IOIO_TYPE_IN 1
+#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
+#define IOIO_TYPE_OUT 0
+#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
+
+#define IOIO_REP BIT(3)
+
+#define IOIO_ADDR_64 BIT(9)
+#define IOIO_ADDR_32 BIT(8)
+#define IOIO_ADDR_16 BIT(7)
+
+#define IOIO_DATA_32 BIT(6)
+#define IOIO_DATA_16 BIT(5)
+#define IOIO_DATA_8 BIT(4)
+
+#define IOIO_SEG_ES (0 << 10)
+#define IOIO_SEG_DS (3 << 10)
+
+static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
+{
+ struct insn *insn = &ctxt->insn;
+ *exitinfo = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ /* INS opcodes */
+ case 0x6c:
+ case 0x6d:
+ *exitinfo |= IOIO_TYPE_INS;
+ *exitinfo |= IOIO_SEG_ES;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* OUTS opcodes */
+ case 0x6e:
+ case 0x6f:
+ *exitinfo |= IOIO_TYPE_OUTS;
+ *exitinfo |= IOIO_SEG_DS;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* IN immediate opcodes */
+ case 0xe4:
+ case 0xe5:
+ *exitinfo |= IOIO_TYPE_IN;
+ *exitinfo |= (u8)insn->immediate.value << 16;
+ break;
+
+ /* OUT immediate opcodes */
+ case 0xe6:
+ case 0xe7:
+ *exitinfo |= IOIO_TYPE_OUT;
+ *exitinfo |= (u8)insn->immediate.value << 16;
+ break;
+
+ /* IN register opcodes */
+ case 0xec:
+ case 0xed:
+ *exitinfo |= IOIO_TYPE_IN;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* OUT register opcodes */
+ case 0xee:
+ case 0xef:
+ *exitinfo |= IOIO_TYPE_OUT;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ default:
+ return ES_DECODE_FAILED;
+ }
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c:
+ case 0x6e:
+ case 0xe4:
+ case 0xe6:
+ case 0xec:
+ case 0xee:
+ /* Single byte opcodes */
+ *exitinfo |= IOIO_DATA_8;
+ break;
+ default:
+ /* Length determined by instruction parsing */
+ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
+ : IOIO_DATA_32;
+ }
+ switch (insn->addr_bytes) {
+ case 2:
+ *exitinfo |= IOIO_ADDR_16;
+ break;
+ case 4:
+ *exitinfo |= IOIO_ADDR_32;
+ break;
+ case 8:
+ *exitinfo |= IOIO_ADDR_64;
+ break;
+ }
+
+ if (insn_has_rep_prefix(insn))
+ *exitinfo |= IOIO_REP;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u64 exit_info_1, exit_info_2;
+ enum es_result ret;
+
+ ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_STR) {
+
+ /* (REP) INS/OUTS */
+
+ bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
+ unsigned int io_bytes, exit_bytes;
+ unsigned int ghcb_count, op_count;
+ unsigned long es_base;
+ u64 sw_scratch;
+
+ /*
+ * For the string variants with rep prefix the amount of in/out
+ * operations per #VC exception is limited so that the kernel
+ * has a chance to take interrupts and re-schedule while the
+ * instruction is emulated.
+ */
+ io_bytes = (exit_info_1 >> 4) & 0x7;
+ ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
+
+ op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
+ exit_info_2 = min(op_count, ghcb_count);
+ exit_bytes = exit_info_2 * io_bytes;
+
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ /* Read bytes of OUTS into the shared buffer */
+ if (!(exit_info_1 & IOIO_TYPE_IN)) {
+ ret = vc_insn_string_read(ctxt,
+ (void *)(es_base + regs->si),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Issue an VMGEXIT to the HV to consume the bytes from the
+ * shared buffer or to have it write them into the shared buffer
+ * depending on the instruction: OUTS or INS.
+ */
+ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
+ ghcb_set_sw_scratch(ghcb, sw_scratch);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO,
+ exit_info_1, exit_info_2);
+ if (ret != ES_OK)
+ return ret;
+
+ /* Read bytes from shared buffer into the guest's destination. */
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ ret = vc_insn_string_write(ctxt,
+ (void *)(es_base + regs->di),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+
+ if (df)
+ regs->di -= exit_bytes;
+ else
+ regs->di += exit_bytes;
+ } else {
+ if (df)
+ regs->si -= exit_bytes;
+ else
+ regs->si += exit_bytes;
+ }
+
+ if (exit_info_1 & IOIO_REP)
+ regs->cx -= exit_info_2;
+
+ ret = regs->cx ? ES_RETRY : ES_OK;
+
+ } else {
+
+ /* IN/OUT into/from rAX */
+
+ int bits = (exit_info_1 & 0x70) >> 1;
+ u64 rax = 0;
+
+ if (!(exit_info_1 & IOIO_TYPE_IN))
+ rax = lower_bits(regs->ax, bits);
+
+ ghcb_set_rax(ghcb, rax);
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt,
+ SVM_EXIT_IOIO, exit_info_1, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+ regs->ax = lower_bits(ghcb->save.rax, bits);
+ }
+ }
+
+ return ret;
+}
+
+static int vc_handle_cpuid_snp(struct pt_regs *regs)
+{
+ struct cpuid_leaf leaf;
+ int ret;
+
+ leaf.fn = regs->ax;
+ leaf.subfn = regs->cx;
+ ret = snp_cpuid(&leaf);
+ if (!ret) {
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u32 cr4 = native_read_cr4();
+ enum es_result ret;
+ int snp_cpuid_ret;
+
+ snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+ if (!snp_cpuid_ret)
+ return ES_OK;
+ if (snp_cpuid_ret != -EOPNOTSUPP)
+ return ES_VMM_ERROR;
+
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rcx(ghcb, regs->cx);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #GP - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ regs->ax = ghcb->save.rax;
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
+ enum es_result ret;
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
+ (!rdtscp || ghcb_rcx_is_valid(ghcb))))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+ if (rdtscp)
+ ctxt->regs->cx = ghcb->save.rcx;
+
+ return ES_OK;
+}
+
+struct cc_setup_data {
+ struct setup_data header;
+ u32 cc_blob_address;
+};
+
+/*
+ * Search for a Confidential Computing blob passed in as a setup_data entry
+ * via the Linux Boot Protocol.
+ */
+static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+{
+ struct cc_setup_data *sd = NULL;
+ struct setup_data *hdr;
+
+ hdr = (struct setup_data *)bp->hdr.setup_data;
+
+ while (hdr) {
+ if (hdr->type == SETUP_CC_BLOB) {
+ sd = (struct cc_setup_data *)hdr;
+ return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
+ }
+ hdr = (struct setup_data *)hdr->next;
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize the kernel's copy of the SNP CPUID table, and set up the
+ * pointer that will be used to access it.
+ *
+ * Maintaining a direct mapping of the SNP CPUID table used by firmware would
+ * be possible as an alternative, but the approach is brittle since the
+ * mapping needs to be updated in sync with all the changes to virtual memory
+ * layout and related mapping facilities throughout the boot process.
+ */
+static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+{
+ const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
+ int i;
+
+ if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys;
+ if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table = snp_cpuid_get_table();
+ memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table));
+
+ /* Initialize CPUID ranges for range-checking. */
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x0)
+ cpuid_std_range_max = fn->eax;
+ else if (fn->eax_in == 0x40000000)
+ cpuid_hyp_range_max = fn->eax;
+ else if (fn->eax_in == 0x80000000)
+ cpuid_ext_range_max = fn->eax;
+ }
+}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
new file mode 100644
index 000000000000..c05f0124c410
--- /dev/null
+++ b/arch/x86/kernel/sev.c
@@ -0,0 +1,2251 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/sched/debug.h> /* For show_regs() */
+#include <linux/percpu-defs.h>
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+#include <asm/cmdline.h>
+
+#define DR7_RESET_VALUE 0x400
+
+/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
+#define AP_INIT_CS_LIMIT 0xffff
+#define AP_INIT_DS_LIMIT 0xffff
+#define AP_INIT_LDTR_LIMIT 0xffff
+#define AP_INIT_GDTR_LIMIT 0xffff
+#define AP_INIT_IDTR_LIMIT 0xffff
+#define AP_INIT_TR_LIMIT 0xffff
+#define AP_INIT_RFLAGS_DEFAULT 0x2
+#define AP_INIT_DR6_DEFAULT 0xffff0ff0
+#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define AP_INIT_XCR0_DEFAULT 0x1
+#define AP_INIT_X87_FTW_DEFAULT 0x5555
+#define AP_INIT_X87_FCW_DEFAULT 0x0040
+#define AP_INIT_CR0_DEFAULT 0x60000010
+#define AP_INIT_MXCSR_DEFAULT 0x1f80
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+static struct ghcb *boot_ghcb __section(".data");
+
+/* Bitmap of SEV features supported by the hypervisor */
+static u64 sev_hv_features __ro_after_init;
+
+/* #VC handler runtime per-CPU data */
+struct sev_es_runtime_data {
+ struct ghcb ghcb_page;
+
+ /*
+ * Reserve one page per CPU as backup storage for the unencrypted GHCB.
+ * It is needed when an NMI happens while the #VC handler uses the real
+ * GHCB, and the NMI handler itself is causing another #VC exception. In
+ * that case the GHCB content of the first handler needs to be backed up
+ * and restored.
+ */
+ struct ghcb backup_ghcb;
+
+ /*
+ * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
+ * There is no need for it to be atomic, because nothing is written to
+ * the GHCB between the read and the write of ghcb_active. So it is safe
+ * to use it when a nested #VC exception happens before the write.
+ *
+ * This is necessary for example in the #VC->NMI->#VC case when the NMI
+ * happens while the first #VC handler uses the GHCB. When the NMI code
+ * raises a second #VC handler it might overwrite the contents of the
+ * GHCB written by the first handler. To avoid this the content of the
+ * GHCB is saved and restored when the GHCB is detected to be in use
+ * already.
+ */
+ bool ghcb_active;
+ bool backup_ghcb_active;
+
+ /*
+ * Cached DR7 value - write it on DR7 writes and return it on reads.
+ * That value will never make it to the real hardware DR7 as debugging
+ * is currently unsupported in SEV-ES guests.
+ */
+ unsigned long dr7;
+};
+
+struct ghcb_state {
+ struct ghcb *ghcb;
+};
+
+static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
+
+static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+
+struct sev_config {
+ __u64 debug : 1,
+ __reserved : 63;
+};
+
+static struct sev_config sev_cfg __read_mostly;
+
+static __always_inline bool on_vc_stack(struct pt_regs *regs)
+{
+ unsigned long sp = regs->sp;
+
+ /* User-mode RSP is not trusted */
+ if (user_mode(regs))
+ return false;
+
+ /* SYSCALL gap still has user-mode RSP */
+ if (ip_within_syscall_gap(regs))
+ return false;
+
+ return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can be also be
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
+ */
+void noinstr __sev_es_ist_enter(struct pt_regs *regs)
+{
+ unsigned long old_ist, new_ist;
+
+ /* Read old IST entry */
+ new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ /*
+ * If NMI happened while on the #VC IST stack, set the new IST
+ * value below regs->sp, so that the interrupted stack frame is
+ * not overwritten by subsequent #VC exceptions.
+ */
+ if (on_vc_stack(regs))
+ new_ist = regs->sp;
+
+ /*
+ * Reserve additional 8 bytes and store old IST value so this
+ * adjustment can be unrolled in __sev_es_ist_exit().
+ */
+ new_ist -= sizeof(old_ist);
+ *(unsigned long *)new_ist = old_ist;
+
+ /* Set new IST entry */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
+
+void noinstr __sev_es_ist_exit(void)
+{
+ unsigned long ist;
+
+ /* Read IST entry */
+ ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+ return;
+
+ /* Read back old IST entry and write it to the TSS */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
+}
+
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = (u32)(val);
+ high = (u32)(val >> 32);
+
+ native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+}
+
+static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
+ unsigned char *buffer)
+{
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+}
+
+static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int insn_bytes;
+
+ insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (insn_bytes == 0) {
+ /* Nothing could be copied */
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ } else if (insn_bytes == -EINVAL) {
+ /* Effective RIP could not be calculated */
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ ctxt->fi.cr2 = 0;
+ return ES_EXCEPTION;
+ }
+
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
+ return ES_DECODE_FAILED;
+
+ if (ctxt->insn.immediate.got)
+ return ES_OK;
+ else
+ return ES_DECODE_FAILED;
+}
+
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int res, ret;
+
+ res = vc_fetch_insn_kernel(ctxt, buffer);
+ if (res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+ else
+ return ES_OK;
+}
+
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ if (user_mode(ctxt->regs))
+ return __vc_decode_user_insn(ctxt);
+ else
+ return __vc_decode_kern_insn(ctxt);
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ char *dst, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+
+ /*
+ * This function uses __put_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __put_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __put_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_to_user() here because
+ * vc_write_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
+ memcpy(&d1, buf, 1);
+ if (__put_user(d1, target))
+ goto fault;
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
+ memcpy(&d2, buf, 2);
+ if (__put_user(d2, target))
+ goto fault;
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
+ memcpy(&d4, buf, 4);
+ if (__put_user(d4, target))
+ goto fault;
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
+ memcpy(&d8, buf, 8);
+ if (__put_user(d8, target))
+ goto fault;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)dst;
+
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ char *src, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT;
+
+ /*
+ * This function uses __get_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __get_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __get_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_from_user() here because
+ * vc_read_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
+ if (__get_user(d1, s))
+ goto fault;
+ memcpy(buf, &d1, 1);
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
+ if (__get_user(d2, s))
+ goto fault;
+ memcpy(buf, &d2, 2);
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
+ if (__get_user(d4, s))
+ goto fault;
+ memcpy(buf, &d4, 4);
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
+ if (__get_user(d8, s))
+ goto fault;
+ memcpy(buf, &d8, 8);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)src;
+
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
+{
+ unsigned long va = (unsigned long)vaddr;
+ unsigned int level;
+ phys_addr_t pa;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd = &pgd[pgd_index(va)];
+ pte = lookup_address_in_pgd(pgd, va, &level);
+ if (!pte) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.cr2 = vaddr;
+ ctxt->fi.error_code = 0;
+
+ if (user_mode(ctxt->regs))
+ ctxt->fi.error_code |= X86_PF_USER;
+
+ return ES_EXCEPTION;
+ }
+
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
+ pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ pa |= va & ~page_level_mask(level);
+
+ *paddr = pa;
+
+ return ES_OK;
+}
+
+/* Include code shared with pre-decompression boot stage */
+#include "sev-shared.c"
+
+static noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
+
+void noinstr __sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
+ VMGEXIT();
+
+ __sev_put_ghcb(&state);
+}
+
+static u64 __init get_secrets_page(void)
+{
+ u64 pa_data = boot_params.cc_blob_address;
+ struct cc_blob_sev_info info;
+ void *map;
+
+ /*
+ * The CC blob contains the address of the secrets page, check if the
+ * blob is present.
+ */
+ if (!pa_data)
+ return 0;
+
+ map = early_memremap(pa_data, sizeof(info));
+ if (!map) {
+ pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
+ return 0;
+ }
+ memcpy(&info, map, sizeof(info));
+ early_memunmap(map, sizeof(info));
+
+ /* smoke-test the secrets page passed */
+ if (!info.secrets_phys || info.secrets_len != PAGE_SIZE)
+ return 0;
+
+ return info.secrets_phys;
+}
+
+static u64 __init get_snp_jump_table_addr(void)
+{
+ struct snp_secrets_page_layout *layout;
+ void __iomem *mem;
+ u64 pa, addr;
+
+ pa = get_secrets_page();
+ if (!pa)
+ return 0;
+
+ mem = ioremap_encrypted(pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
+ return 0;
+ }
+
+ layout = (__force struct snp_secrets_page_layout *)mem;
+
+ addr = layout->os_area.ap_jump_table_pa;
+ iounmap(mem);
+
+ return addr;
+}
+
+static u64 __init get_jump_table_addr(void)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ u64 ret = 0;
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return get_snp_jump_table_addr();
+
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
+ ghcb_sw_exit_info_2_is_valid(ghcb))
+ ret = ghcb->save.sw_exit_info_2;
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate)
+{
+ unsigned long vaddr_end;
+ int rc;
+
+ vaddr = vaddr & PAGE_MASK;
+ vaddr_end = vaddr + (npages << PAGE_SHIFT);
+
+ while (vaddr < vaddr_end) {
+ rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ vaddr = vaddr + PAGE_SIZE;
+ }
+}
+
+static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
+{
+ unsigned long paddr_end;
+ u64 val;
+
+ paddr = paddr & PAGE_MASK;
+ paddr_end = paddr + (npages << PAGE_SHIFT);
+
+ while (paddr < paddr_end) {
+ /*
+ * Use the MSR protocol because this function can be called before
+ * the GHCB is established.
+ */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
+ "Wrong PSC response code: 0x%x\n",
+ (unsigned int)GHCB_RESP_CODE(val)))
+ goto e_term;
+
+ if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
+ "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
+ op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
+ paddr, GHCB_MSR_PSC_RESP_VAL(val)))
+ goto e_term;
+
+ paddr = paddr + PAGE_SIZE;
+ }
+
+ return;
+
+e_term:
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /*
+ * Ask the hypervisor to mark the memory pages as private in the RMP
+ * table.
+ */
+ early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE);
+
+ /* Validate the memory pages after they've been added in the RMP table. */
+ pvalidate_pages(vaddr, npages, true);
+}
+
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /* Invalidate the memory pages before they are marked shared in the RMP table. */
+ pvalidate_pages(vaddr, npages, false);
+
+ /* Ask hypervisor to mark the memory pages shared in the RMP table. */
+ early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
+{
+ unsigned long vaddr, npages;
+
+ vaddr = (unsigned long)__va(paddr);
+ npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ early_snp_set_memory_private(vaddr, paddr, npages);
+ else if (op == SNP_PAGE_STATE_SHARED)
+ early_snp_set_memory_shared(vaddr, paddr, npages);
+ else
+ WARN(1, "invalid memory op %d\n", op);
+}
+
+static int vmgexit_psc(struct snp_psc_desc *desc)
+{
+ int cur_entry, end_entry, ret = 0;
+ struct snp_psc_desc *data;
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ unsigned long flags;
+ struct ghcb *ghcb;
+
+ /*
+ * __sev_get_ghcb() needs to run with IRQs disabled because it is using
+ * a per-CPU GHCB.
+ */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+ if (!ghcb) {
+ ret = 1;
+ goto out_unlock;
+ }
+
+ /* Copy the input desc into GHCB shared buffer */
+ data = (struct snp_psc_desc *)ghcb->shared_buffer;
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
+
+ /*
+ * As per the GHCB specification, the hypervisor can resume the guest
+ * before processing all the entries. Check whether all the entries
+ * are processed. If not, then keep retrying. Note, the hypervisor
+ * will update the data memory directly to indicate the status, so
+ * reference the data->hdr everywhere.
+ *
+ * The strategy here is to wait for the hypervisor to change the page
+ * state in the RMP table before guest accesses the memory pages. If the
+ * page state change was not successful, then later memory access will
+ * result in a crash.
+ */
+ cur_entry = data->hdr.cur_entry;
+ end_entry = data->hdr.end_entry;
+
+ while (data->hdr.cur_entry <= data->hdr.end_entry) {
+ ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
+
+ /* This will advance the shared buffer data points to. */
+ ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
+
+ /*
+ * Page State Change VMGEXIT can pass error code through
+ * exit_info_2.
+ */
+ if (WARN(ret || ghcb->save.sw_exit_info_2,
+ "SNP: PSC failed ret=%d exit_info_2=%llx\n",
+ ret, ghcb->save.sw_exit_info_2)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Verify that reserved bit is not set */
+ if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Sanity check that entry processing is not going backwards.
+ * This will happen only if hypervisor is tricking us.
+ */
+ if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
+"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
+ end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ __sev_put_ghcb(&state);
+
+out_unlock:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
+ unsigned long vaddr_end, int op)
+{
+ struct psc_hdr *hdr;
+ struct psc_entry *e;
+ unsigned long pfn;
+ int i;
+
+ hdr = &data->hdr;
+ e = data->entries;
+
+ memset(data, 0, sizeof(*data));
+ i = 0;
+
+ while (vaddr < vaddr_end) {
+ if (is_vmalloc_addr((void *)vaddr))
+ pfn = vmalloc_to_pfn((void *)vaddr);
+ else
+ pfn = __pa(vaddr) >> PAGE_SHIFT;
+
+ e->gfn = pfn;
+ e->operation = op;
+ hdr->end_entry = i;
+
+ /*
+ * Current SNP implementation doesn't keep track of the RMP page
+ * size so use 4K for simplicity.
+ */
+ e->pagesize = RMP_PG_SIZE_4K;
+
+ vaddr = vaddr + PAGE_SIZE;
+ e++;
+ i++;
+ }
+
+ if (vmgexit_psc(data))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
+{
+ unsigned long vaddr_end, next_vaddr;
+ struct snp_psc_desc *desc;
+
+ desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
+ if (!desc)
+ panic("SNP: failed to allocate memory for PSC descriptor\n");
+
+ vaddr = vaddr & PAGE_MASK;
+ vaddr_end = vaddr + (npages << PAGE_SHIFT);
+
+ while (vaddr < vaddr_end) {
+ /* Calculate the last vaddr that fits in one struct snp_psc_desc. */
+ next_vaddr = min_t(unsigned long, vaddr_end,
+ (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
+
+ __set_pages_state(desc, vaddr, next_vaddr, op);
+
+ vaddr = next_vaddr;
+ }
+
+ kfree(desc);
+}
+
+void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ pvalidate_pages(vaddr, npages, false);
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+void snp_set_memory_private(unsigned long vaddr, unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
+
+ pvalidate_pages(vaddr, npages, true);
+}
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
+#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
+#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
+
+#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
+#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
+
+static void *snp_alloc_vmsa_page(void)
+{
+ struct page *p;
+
+ /*
+ * Allocate VMSA page to work around the SNP erratum where the CPU will
+ * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
+ * collides with the RMP entry of VMSA page. The recommended workaround
+ * is to not use a large page.
+ *
+ * Allocate an 8k page which is also 8k-aligned.
+ */
+ p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
+ __free_page(p);
+
+ return page_address(p + 1);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+{
+ struct sev_es_save_area *cur_vmsa, *vmsa;
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ u8 sipi_vector;
+ int cpu, ret;
+ u64 cr4;
+
+ /*
+ * The hypervisor SNP feature support check has happened earlier, just check
+ * the AP_CREATION one here.
+ */
+ if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
+ return -EOPNOTSUPP;
+
+ /*
+ * Verify the desired start IP against the known trampoline start IP
+ * to catch any future new trampolines that may be introduced that
+ * would require a new protected guest entry point.
+ */
+ if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
+ "Unsupported SNP start_ip: %lx\n", start_ip))
+ return -EINVAL;
+
+ /* Override start_ip with known protected guest start IP */
+ start_ip = real_mode_header->sev_es_trampoline_start;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apic_id))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ cur_vmsa = per_cpu(sev_vmsa, cpu);
+
+ /*
+ * A new VMSA is created each time because there is no guarantee that
+ * the current VMSA is the kernels or that the vCPU is not running. If
+ * an attempt was done to use the current VMSA with a running vCPU, a
+ * #VMEXIT of that vCPU would wipe out all of the settings being done
+ * here.
+ */
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
+ if (!vmsa)
+ return -ENOMEM;
+
+ /* CR4 should maintain the MCE value */
+ cr4 = native_read_cr4() & X86_CR4_MCE;
+
+ /* Set the CS value based on the start_ip converted to a SIPI vector */
+ sipi_vector = (start_ip >> 12);
+ vmsa->cs.base = sipi_vector << 12;
+ vmsa->cs.limit = AP_INIT_CS_LIMIT;
+ vmsa->cs.attrib = INIT_CS_ATTRIBS;
+ vmsa->cs.selector = sipi_vector << 8;
+
+ /* Set the RIP value based on start_ip */
+ vmsa->rip = start_ip & 0xfff;
+
+ /* Set AP INIT defaults as documented in the APM */
+ vmsa->ds.limit = AP_INIT_DS_LIMIT;
+ vmsa->ds.attrib = INIT_DS_ATTRIBS;
+ vmsa->es = vmsa->ds;
+ vmsa->fs = vmsa->ds;
+ vmsa->gs = vmsa->ds;
+ vmsa->ss = vmsa->ds;
+
+ vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
+ vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
+ vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
+ vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
+ vmsa->tr.limit = AP_INIT_TR_LIMIT;
+ vmsa->tr.attrib = INIT_TR_ATTRIBS;
+
+ vmsa->cr4 = cr4;
+ vmsa->cr0 = AP_INIT_CR0_DEFAULT;
+ vmsa->dr7 = DR7_RESET_VALUE;
+ vmsa->dr6 = AP_INIT_DR6_DEFAULT;
+ vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
+ vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
+ vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
+ vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
+ vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
+ vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
+
+ /* SVME must be set. */
+ vmsa->efer = EFER_SVME;
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ /* Switch the page over to a VMSA page now that it is initialized */
+ ret = snp_set_vmsa(vmsa, true);
+ if (ret) {
+ pr_err("set VMSA page failed (%u)\n", ret);
+ free_page((unsigned long)vmsa);
+
+ return -EINVAL;
+ }
+
+ /* Issue VMGEXIT AP Creation NAE event */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_rax(ghcb, vmsa->sev_features);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+ ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
+ ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ lower_32_bits(ghcb->save.sw_exit_info_1)) {
+ pr_err("SNP AP Creation error\n");
+ ret = -EINVAL;
+ }
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ /* Perform cleanup if there was an error */
+ if (ret) {
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+void snp_set_wakeup_secondary_cpu(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /*
+ * Always set this override if SNP is enabled. This makes it the
+ * required method to start APs under SNP. If the hypervisor does
+ * not support AP creation, then no APs will be started.
+ */
+ apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit;
+}
+
+int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
+{
+ u16 startup_cs, startup_ip;
+ phys_addr_t jump_table_pa;
+ u64 jump_table_addr;
+ u16 __iomem *jump_table;
+
+ jump_table_addr = get_jump_table_addr();
+
+ /* On UP guests there is no jump table so this is not a failure */
+ if (!jump_table_addr)
+ return 0;
+
+ /* Check if AP Jump Table is page-aligned */
+ if (jump_table_addr & ~PAGE_MASK)
+ return -EINVAL;
+
+ jump_table_pa = jump_table_addr & PAGE_MASK;
+
+ startup_cs = (u16)(rmh->trampoline_start >> 4);
+ startup_ip = (u16)(rmh->sev_es_trampoline_start -
+ rmh->trampoline_start);
+
+ jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
+ if (!jump_table)
+ return -EIO;
+
+ writew(startup_ip, &jump_table[0]);
+ writew(startup_cs, &jump_table[1]);
+
+ iounmap(jump_table);
+
+ return 0;
+}
+
+/*
+ * This is needed by the OVMF UEFI firmware which will use whatever it finds in
+ * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
+ * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ */
+int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
+{
+ struct sev_es_runtime_data *data;
+ unsigned long address, pflags;
+ int cpu;
+ u64 pfn;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return 0;
+
+ pflags = _PAGE_NX | _PAGE_RW;
+
+ for_each_possible_cpu(cpu) {
+ data = per_cpu(runtime_data, cpu);
+
+ address = __pa(&data->ghcb_page);
+ pfn = address >> PAGE_SHIFT;
+
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ enum es_result ret;
+ u64 exit_info_1;
+
+ /* Is it a WRMSR? */
+ exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
+
+ ghcb_set_rcx(ghcb, regs->cx);
+ if (exit_info_1) {
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rdx(ghcb, regs->dx);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR,
+ exit_info_1, 0);
+
+ if ((ret == ES_OK) && (!exit_info_1)) {
+ regs->ax = ghcb->save.rax;
+ regs->dx = ghcb->save.rdx;
+ }
+
+ return ret;
+}
+
+static void snp_register_per_cpu_ghcb(void)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ snp_register_ghcb_early(__pa(ghcb));
+}
+
+void setup_ghcb(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return;
+
+ /* First make sure the hypervisor talks a supported protocol. */
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
+ * Check whether the runtime #VC exception handler is active. It uses
+ * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
+ *
+ * If SNP is active, register the per-CPU GHCB page so that the runtime
+ * exception handler can use it.
+ */
+ if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_per_cpu_ghcb();
+
+ return;
+ }
+
+ /*
+ * Clear the boot_ghcb. The first exception comes in before the bss
+ * section is cleared.
+ */
+ memset(&boot_ghcb_page, 0, PAGE_SIZE);
+
+ /* Alright - Make the boot-ghcb public */
+ boot_ghcb = &boot_ghcb_page;
+
+ /* SNP guest requires that GHCB GPA must be registered. */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sev_es_ap_hlt_loop(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ while (true) {
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ /* Wakeup signal? */
+ if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
+ ghcb->save.sw_exit_info_2)
+ break;
+ }
+
+ __sev_put_ghcb(&state);
+}
+
+/*
+ * Play_dead handler when running under SEV-ES. This is needed because
+ * the hypervisor can't deliver an SIPI request to restart the AP.
+ * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
+ * hypervisor wakes it up again.
+ */
+static void sev_es_play_dead(void)
+{
+ play_dead_common();
+
+ /* IRQs now disabled */
+
+ sev_es_ap_hlt_loop();
+
+ /*
+ * If we get here, the VCPU was woken up again. Jump to CPU
+ * startup code to get it back online.
+ */
+ start_cpu0();
+}
+#else /* CONFIG_HOTPLUG_CPU */
+#define sev_es_play_dead native_play_dead
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_SMP
+static void __init sev_es_setup_play_dead(void)
+{
+ smp_ops.play_dead = sev_es_play_dead;
+}
+#else
+static inline void sev_es_setup_play_dead(void) { }
+#endif
+
+static void __init alloc_runtime_data(int cpu)
+{
+ struct sev_es_runtime_data *data;
+
+ data = memblock_alloc(sizeof(*data), PAGE_SIZE);
+ if (!data)
+ panic("Can't allocate SEV-ES runtime data");
+
+ per_cpu(runtime_data, cpu) = data;
+}
+
+static void __init init_ghcb(int cpu)
+{
+ struct sev_es_runtime_data *data;
+ int err;
+
+ data = per_cpu(runtime_data, cpu);
+
+ err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
+ sizeof(data->ghcb_page));
+ if (err)
+ panic("Can't map GHCBs unencrypted");
+
+ memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
+
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+}
+
+void __init sev_es_init_vc_handling(void)
+{
+ int cpu;
+
+ BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
+
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return;
+
+ if (!sev_es_check_cpu_features())
+ panic("SEV-ES CPU Features missing");
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ sev_hv_features = get_hv_features();
+
+ if (!(sev_hv_features & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+ }
+
+ /* Enable SEV-ES special handling */
+ static_branch_enable(&sev_es_enable_key);
+
+ /* Initialize per-cpu GHCB pages */
+ for_each_possible_cpu(cpu) {
+ alloc_runtime_data(cpu);
+ init_ghcb(cpu);
+ }
+
+ sev_es_setup_play_dead();
+
+ /* Secondary CPUs use the runtime #VC handler */
+ initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
+}
+
+static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
+{
+ int trapnr = ctxt->fi.vector;
+
+ if (trapnr == X86_TRAP_PF)
+ native_write_cr2(ctxt->fi.cr2);
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+ do_early_exception(ctxt->regs, trapnr);
+}
+
+static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
+{
+ long *reg_array;
+ int offset;
+
+ reg_array = (long *)ctxt->regs;
+ offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
+
+ if (offset < 0)
+ return NULL;
+
+ offset /= sizeof(long);
+
+ return reg_array + offset;
+}
+static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned int bytes, bool read)
+{
+ u64 exit_code, exit_info_1, exit_info_2;
+ unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
+ phys_addr_t paddr;
+ void __user *ref;
+
+ ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
+ if (ref == (void __user *)-1L)
+ return ES_UNSUPPORTED;
+
+ exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
+
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return res;
+ }
+
+ exit_info_1 = paddr;
+ /* Can never be greater than 8 */
+ exit_info_2 = bytes;
+
+ ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
+
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
+}
+
+/*
+ * The MOVS instruction has two memory operands, which raises the
+ * problem that it is not known whether the access to the source or the
+ * destination caused the #VC exception (and hence whether an MMIO read
+ * or write operation needs to be emulated).
+ *
+ * Instead of playing games with walking page-tables and trying to guess
+ * whether the source or destination is an MMIO range, split the move
+ * into two operations, a read and a write with only one memory operand.
+ * This will cause a nested #VC exception on the MMIO address which can
+ * then be handled.
+ *
+ * This implementation has the benefit that it also supports MOVS where
+ * source _and_ destination are MMIO regions.
+ *
+ * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
+ * rare operation. If it turns out to be a performance problem the split
+ * operations can be moved to memcpy_fromio() and memcpy_toio().
+ */
+static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
+ unsigned int bytes)
+{
+ unsigned long ds_base, es_base;
+ unsigned char *src, *dst;
+ unsigned char buffer[8];
+ enum es_result ret;
+ bool rep;
+ int off;
+
+ ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ if (ds_base == -1L || es_base == -1L) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ src = ds_base + (unsigned char *)ctxt->regs->si;
+ dst = es_base + (unsigned char *)ctxt->regs->di;
+
+ ret = vc_read_mem(ctxt, src, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ ret = vc_write_mem(ctxt, dst, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ if (ctxt->regs->flags & X86_EFLAGS_DF)
+ off = -bytes;
+ else
+ off = bytes;
+
+ ctxt->regs->si += off;
+ ctxt->regs->di += off;
+
+ rep = insn_has_rep_prefix(&ctxt->insn);
+ if (rep)
+ ctxt->regs->cx -= 1;
+
+ if (!rep || ctxt->regs->cx == 0)
+ return ES_OK;
+ else
+ return ES_RETRY;
+}
+
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct insn *insn = &ctxt->insn;
+ unsigned int bytes = 0;
+ enum mmio_type mmio;
+ enum es_result ret;
+ u8 sign_byte;
+ long *reg_data;
+
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
+
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+ }
+
+ switch (mmio) {
+ case MMIO_WRITE:
+ memcpy(ghcb->shared_buffer, reg_data, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case MMIO_WRITE_IMM:
+ memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case MMIO_READ:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero-extend for 32-bit operation */
+ if (bytes == 4)
+ *reg_data = 0;
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
+ break;
+ default:
+ ret = ES_UNSUPPORTED;
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long val, *reg = vc_insn_get_rm(ctxt);
+ enum es_result ret;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ val = *reg;
+
+ /* Upper 32 bits must be written as zeroes */
+ if (val >> 32) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ /* Clear out other reserved bits and set bit 10 */
+ val = (val & 0xffff23ffL) | BIT(10);
+
+ /* Early non-zero writes to DR7 are not supported */
+ if (!data && (val & ~DR7_RESET_VALUE))
+ return ES_UNSUPPORTED;
+
+ /* Using a value of 0 for ExitInfo1 means RAX holds the value */
+ ghcb_set_rax(ghcb, val);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (data)
+ data->dr7 = val;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long *reg = vc_insn_get_rm(ctxt);
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ if (data)
+ *reg = data->dr7;
+ else
+ *reg = DR7_RESET_VALUE;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0);
+}
+
+static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rcx(ghcb, ctxt->regs->cx);
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_monitor(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Treat it as a NOP and do not leak a physical address to the
+ * hypervisor.
+ */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_mwait(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /* Treat the same as MONITOR/MONITORX */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rax(ghcb, ctxt->regs->ax);
+ ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
+
+ if (x86_platform.hyper.sev_es_hcall_prepare)
+ x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+
+ /*
+ * Call sev_es_hcall_finish() after regs->ax is already set.
+ * This allows the hypervisor handler to overwrite it again if
+ * necessary.
+ */
+ if (x86_platform.hyper.sev_es_hcall_finish &&
+ !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
+ return ES_VMM_ERROR;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Calling ecx_alignment_check() directly does not work, because it
+ * enables IRQs and the GHCB is active. Forward the exception and call
+ * it later from vc_forward_exception().
+ */
+ ctxt->fi.vector = X86_TRAP_AC;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
+ struct ghcb *ghcb,
+ unsigned long exit_code)
+{
+ enum es_result result;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ result = vc_handle_dr7_read(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ result = vc_handle_dr7_write(ghcb, ctxt);
+ break;
+ case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
+ result = vc_handle_trap_ac(ghcb, ctxt);
+ break;
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
+ break;
+ case SVM_EXIT_RDPMC:
+ result = vc_handle_rdpmc(ghcb, ctxt);
+ break;
+ case SVM_EXIT_INVD:
+ pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
+ result = ES_UNSUPPORTED;
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(ghcb, ctxt);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MSR:
+ result = vc_handle_msr(ghcb, ctxt);
+ break;
+ case SVM_EXIT_VMMCALL:
+ result = vc_handle_vmmcall(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WBINVD:
+ result = vc_handle_wbinvd(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MONITOR:
+ result = vc_handle_monitor(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MWAIT:
+ result = vc_handle_mwait(ghcb, ctxt);
+ break;
+ case SVM_EXIT_NPF:
+ result = vc_handle_mmio(ghcb, ctxt);
+ break;
+ default:
+ /*
+ * Unexpected #VC exception
+ */
+ result = ES_UNSUPPORTED;
+ }
+
+ return result;
+}
+
+static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
+{
+ long error_code = ctxt->fi.error_code;
+ int trapnr = ctxt->fi.vector;
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+
+ switch (trapnr) {
+ case X86_TRAP_GP:
+ exc_general_protection(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_UD:
+ exc_invalid_op(ctxt->regs);
+ break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_AC:
+ exc_alignment_check(ctxt->regs, error_code);
+ break;
+ default:
+ pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
+ BUG();
+ }
+}
+
+static __always_inline bool is_vc2_stack(unsigned long sp)
+{
+ return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
+}
+
+static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
+{
+ unsigned long sp, prev_sp;
+
+ sp = (unsigned long)regs;
+ prev_sp = regs->sp;
+
+ /*
+ * If the code was already executing on the VC2 stack when the #VC
+ * happened, let it proceed to the normal handling routine. This way the
+ * code executing on the VC2 stack can cause #VC exceptions to get handled.
+ */
+ return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
+}
+
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+ struct ghcb *ghcb;
+ bool ret = true;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code);
+
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, ghcb, error_code);
+
+ __sev_put_ghcb(&state);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_VMM_ERROR:
+ pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_DECODE_FAILED:
+ pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ pr_emerg("Unknown result in %s():%d\n", __func__, result);
+ /*
+ * Emulating the instruction which caused the #VC exception
+ * failed - can't continue so print debug information
+ */
+ BUG();
+ }
+
+ return ret;
+}
+
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
+
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
+{
+ irqentry_state_t irq_state;
+
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+ * to bug elsewhere.
+ */
+ if (unlikely(vc_from_invalid_context(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+ }
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ exc_debug(regs);
+ return;
+ }
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /* Show some debug info */
+ show_regs(regs);
+
+ /* Ask hypervisor to sev_es_terminate */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /* If that fails and we get here - just panic */
+ panic("Returned from Terminate-Request to Hypervisor\n");
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+/*
+ * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
+ * and will kill the current task with SIGBUS when an error happens.
+ */
+DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
+{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ noist_exc_debug(regs);
+ return;
+ }
+
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+}
+
+bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
+{
+ unsigned long exit_code = regs->orig_ax;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ vc_ghcb_invalidate(boot_ghcb);
+
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_VMM_ERROR:
+ early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_DECODE_FAILED:
+ early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_EXCEPTION:
+ vc_early_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+fail:
+ show_regs(regs);
+
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the kernel
+ * in the following ways, depending on how it is booted:
+ *
+ * - when booted via the boot/decompress kernel:
+ * - via boot_params
+ *
+ * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
+ * - via a setup_data entry, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+ goto found_cc_info;
+ }
+
+ /*
+ * If kernel was booted directly, without the use of the
+ * boot/decompression kernel, the CC blob may have been passed via
+ * setup_data instead.
+ */
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ snp_abort();
+
+ return cc_info;
+}
+
+bool __init snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ setup_cpuid_table(cc_info);
+
+ /*
+ * The CC blob will be used later to access the secrets page. Cache
+ * it here like the boot kernel does.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void __init snp_abort(void)
+{
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+}
+
+static void dump_cpuid_table(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i = 0;
+
+ pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
+ cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
+
+ for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
+ i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
+ fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
+ }
+}
+
+/*
+ * It is useful from an auditing/testing perspective to provide an easy way
+ * for the guest owner to know that the CPUID table has been initialized as
+ * expected, but that initialization happens too early in boot to print any
+ * sort of indicator, and there's not really any other good place to do it,
+ * so do it here.
+ */
+static int __init report_cpuid_table(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return 0;
+
+ pr_info("Using SNP CPUID table, %d entries present.\n",
+ cpuid_table->count);
+
+ if (sev_cfg.debug)
+ dump_cpuid_table();
+
+ return 0;
+}
+arch_initcall(report_cpuid_table);
+
+static int __init init_sev_config(char *str)
+{
+ char *s;
+
+ while ((s = strsep(&str, ","))) {
+ if (!strcmp(s, "debug")) {
+ sev_cfg.debug = true;
+ continue;
+ }
+
+ pr_info("SEV command-line option '%s' was not recognized\n", s);
+ }
+
+ return 1;
+}
+__setup("sev=", init_sev_config);
+
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ if (!fw_err)
+ return -EINVAL;
+
+ /*
+ * __sev_get_ghcb() needs to run with IRQs disabled because it is using
+ * a per-CPU GHCB.
+ */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+ if (!ghcb) {
+ ret = -EIO;
+ goto e_restore_irq;
+ }
+
+ vc_ghcb_invalidate(ghcb);
+
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ ghcb_set_rax(ghcb, input->data_gpa);
+ ghcb_set_rbx(ghcb, input->data_npages);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
+ if (ret)
+ goto e_put;
+
+ if (ghcb->save.sw_exit_info_2) {
+ /* Number of expected pages are returned in RBX */
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST &&
+ ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN)
+ input->data_npages = ghcb_get_rbx(ghcb);
+
+ *fw_err = ghcb->save.sw_exit_info_2;
+
+ ret = -EIO;
+ }
+
+e_put:
+ __sev_put_ghcb(&state);
+e_restore_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(snp_issue_guest_request);
+
+static struct platform_device sev_guest_device = {
+ .name = "sev-guest",
+ .id = -1,
+};
+
+static int __init snp_init_platform_device(void)
+{
+ struct sev_guest_platform_data data;
+ u64 gpa;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ gpa = get_secrets_page();
+ if (!gpa)
+ return -ENODEV;
+
+ data.secrets_gpa = gpa;
+ if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
+ return -ENODEV;
+
+ if (platform_device_register(&sev_guest_device))
+ return -ENODEV;
+
+ pr_info("SNP guest platform device initialized.\n");
+ return 0;
+}
+device_initcall(snp_init_platform_device);
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index ee04941a6546..3355e27c69eb 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -85,5 +85,5 @@ SYM_FUNC_START(sev_verify_cbit)
#endif
/* Return page-table pointer */
movq %rdi, %rax
- ret
+ RET
SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f306e85a08a6..9c7265b524c7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -15,9 +15,9 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
-#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
@@ -30,8 +30,8 @@
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
+#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
@@ -41,6 +41,7 @@
#include <linux/compat.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/fpu/xstate.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -79,9 +80,9 @@ static void force_valid_ss(struct pt_regs *regs)
# define CONTEXT_COPY_SIZE sizeof(struct sigcontext)
#endif
-static int restore_sigcontext(struct pt_regs *regs,
- struct sigcontext __user *usc,
- unsigned long uc_flags)
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
{
struct sigcontext sc;
@@ -89,10 +90,10 @@ static int restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
- return -EFAULT;
+ return false;
#ifdef CONFIG_X86_32
- set_user_gs(regs, sc.gs);
+ loadsegment(gs, sc.gs);
regs->fs = sc.fs;
regs->es = sc.es;
regs->ds = sc.ds;
@@ -145,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask)
{
#ifdef CONFIG_X86_32
- unsafe_put_user(get_user_gs(regs),
- (unsigned int __user *)&sc->gs, Efault);
+ unsigned int gs;
+ savesegment(gs, gs);
+
+ unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault);
unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
@@ -212,6 +215,11 @@ do { \
* Set up a signal frame.
*/
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT 16UL
+
+#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1)
+
/*
* Determine which stack to use..
*/
@@ -222,9 +230,9 @@ static unsigned long align_sigframe(unsigned long sp)
* Align the stack pointer according to the i386 ABI,
* i.e. so that on function entry ((sp + 4) & 15) == 0.
*/
- sp = ((sp + 4) & -16ul) - 4;
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
#else /* !CONFIG_X86_32 */
- sp = round_down(sp, 16) - 8;
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
#endif
return sp;
}
@@ -234,11 +242,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
/* Default to using normal stack */
+ bool nested_altstack = on_sig_stack(regs->sp);
+ bool entering_altstack = false;
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
- int onsigstack = on_sig_stack(sp);
- int ret;
/* redzone */
if (IS_ENABLED(CONFIG_X86_64))
@@ -246,15 +254,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
+ /*
+ * This checks nested_altstack via sas_ss_flags(). Sensible
+ * programs use SS_AUTODISARM, which disables that check, and
+ * programs that don't use SS_AUTODISARM get compatible.
+ */
+ if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+ entering_altstack = true;
+ }
} else if (IS_ENABLED(CONFIG_X86_32) &&
- !onsigstack &&
+ !nested_altstack &&
regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer) {
/* This is the legacy signal stack switching. */
sp = (unsigned long) ka->sa.sa_restorer;
+ entering_altstack = true;
}
sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
@@ -267,12 +283,18 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
* If we are on the alternate signal stack and would overflow it, don't.
* Return an always-bogus address instead so we will die with SIGSEGV.
*/
- if (onsigstack && !likely(on_sig_stack(sp)))
+ if (unlikely((nested_altstack || entering_altstack) &&
+ !__on_sig_stack(sp))) {
+
+ if (show_unhandled_signals && printk_ratelimit())
+ pr_info("%s[%d] overflowed sigaltstack\n",
+ current->comm, task_pid_nr(current));
+
return (void __user *)-1L;
+ }
/* save i387 and extended state */
- ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
- if (ret < 0)
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
return (void __user *)-1L;
return (void __user *)sp;
@@ -492,7 +514,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
* SS descriptor, but we do need SS to be valid. It's possible
* that the old SS is entirely bogus -- this can happen if the
* signal we're trying to deliver is #GP or #SS caused by a bad
- * SS value. We also have a compatbility issue here: DOSEMU
+ * SS value. We also have a compatibility issue here: DOSEMU
* relies on the contents of the SS register indicating the
* SS value at the time of the signal, even though that code in
* DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
@@ -622,7 +644,7 @@ SYSCALL_DEFINE0(sigreturn)
* x86_32 has no uc_flags bits relevant to restore_sigcontext.
* Save a few cycles by skipping the __get_user.
*/
- if (restore_sigcontext(regs, &frame->sc, 0))
+ if (!restore_sigcontext(regs, &frame->sc, 0))
goto badframe;
return regs->ax;
@@ -650,7 +672,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
@@ -663,6 +685,64 @@ badframe:
return 0;
}
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+#else
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+#endif
+
+/*
+ * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+ * If a signal frame starts at an unaligned address, extra space is required.
+ * This is the max alignment padding, conservatively.
+ */
+#define MAX_XSAVE_PADDING 63UL
+
+/*
+ * The frame data is composed of the following areas and laid out as:
+ *
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | (f)xsave frame |
+ * -------------------------
+ * | fsave header |
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | siginfo + ucontext |
+ * -------------------------
+ */
+
+/* max_frame_size tells userspace the worst case signal stack size. */
+static unsigned long __ro_after_init max_frame_size;
+static unsigned int __ro_after_init fpu_default_state_size;
+
+void __init init_sigframe_size(void)
+{
+ fpu_default_state_size = fpu__get_fpstate_size();
+
+ max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
+
+ max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING;
+
+ /* Userspace expects an aligned size. */
+ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
+
+ pr_info("max sigframe size: %lu\n", max_frame_size);
+}
+
+unsigned long get_sigframe_size(void)
+{
+ return max_frame_size;
+}
+
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
@@ -713,7 +793,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* If so, check system call restarting.. */
switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
@@ -782,18 +862,18 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
struct ksignal ksig;
- if (has_signal && get_signal(&ksig)) {
+ if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* Restart the system call - no handlers present */
switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
@@ -834,6 +914,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
force_sig(SIGSEGV);
}
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+ return kstrtobool(arg, &strict_sigaltstack_size);
+}
+__setup("strict_sas_size", strict_sas_size);
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * exceeds that size already. As such programs might never use the
+ * sigaltstack they just continued to work. While always checking against
+ * the real size would be correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly install sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+ unsigned long fsize = max_frame_size - fpu_default_state_size;
+ u64 mask;
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+ return true;
+
+ fsize += current->group_leader->thread.fpu.perm.__user_state_size;
+ if (likely(ss_size > fsize))
+ return true;
+
+ if (strict_sigaltstack_size)
+ return ss_size > fsize;
+
+ mask = current->group_leader->thread.fpu.perm.__state_perm;
+ if (mask & XFEATURE_MASK_USER_DYNAMIC)
+ return ss_size > fsize;
+
+ return true;
+}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
+
#ifdef CONFIG_X86_X32_ABI
COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
{
@@ -853,7 +989,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..879ef8c72f5c 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,12 +29,18 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE != 15);
BUILD_BUG_ON(NSIGSEGV != 9);
BUILD_BUG_ON(NSIGBUS != 5);
- BUILD_BUG_ON(NSIGTRAP != 5);
+ BUILD_BUG_ON(NSIGTRAP != 6);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS != 2);
/* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(siginfo_t) != 128);
BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+
+ /* This is a part of the ABI and can never change in alignment */
+ BUILD_BUG_ON(__alignof__(siginfo_t) != 8);
+ BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4);
+
/*
* The offsets of all the (unioned) si_fields are fixed
* in the ABI, of course. Make sure none of them ever
@@ -127,6 +133,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10);
+
BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
@@ -138,6 +147,13 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18);
+
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index eff4ce3b10da..06db901fabe8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -67,7 +67,7 @@
* 5AP. symmetric IO mode (normal Linux operation) not affected.
* 'noapic' mode has vector 0xf filled out properly.
* 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
* 8AP. We do not enable low power mode (deep sleep) during MP bootup
* 9AP. We do not use mixed mode
*
@@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait)
}
/*
* Don't wait longer than 10 ms if the caller didn't
- * reqeust it. If wait is true, the machine hangs here if
+ * request it. If wait is true, the machine hangs here if
* one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 02813a7f3a7c..5e7f9532a10d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,7 +56,6 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
-#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -70,7 +69,7 @@
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
@@ -82,10 +81,7 @@
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
-
-#ifdef CONFIG_ACPI_CPPC_LIB
-#include <acpi/cppc_acpi.h>
-#endif
+#include <asm/sev.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -101,6 +97,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -153,8 +151,6 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
-static void init_freq_invariance(bool secondary, bool cppc_ready);
-
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -191,7 +187,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance(true, false);
+ ap_init_aperfmperf();
/*
* Get our bogomips.
@@ -232,11 +228,9 @@ static void notrace start_secondary(void *unused)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- cpu_init_exception_handling();
- cpu_init();
+ cpu_init_secondary();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();
enable_start_cpu0 = 0;
@@ -458,29 +452,67 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_die_id == o->cpu_die_id)
+ return true;
+ return false;
+}
+
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* If the arch didn't set up l2c_id, fall back to SMT */
+ if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+ return match_smt(c, o);
+
+ /* Do not match if L2 cache id does not match: */
+ if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+ return false;
+
+ return topology_sane(c, o, "l2c");
+}
+
/*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id)
+ return true;
+ return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
*
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
*
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
*/
-static const struct x86_cpu_id snc_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
+static const struct x86_cpu_id intel_cod_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
{}
};
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+ bool intel_snc = id && id->driver_data;
/* Do not match if we do not have a valid APICID for cpu: */
if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -495,34 +527,14 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
* means 'c' does not share the LLC of 'o'. This will be
* reflected to userspace.
*/
- if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+ if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
return false;
return topology_sane(c, o, "llc");
}
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
- if (c->phys_proc_id == o->phys_proc_id)
- return true;
- return false;
-}
-
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
- if ((c->phys_proc_id == o->phys_proc_id) &&
- (c->cpu_die_id == o->cpu_die_id))
- return true;
- return false;
-}
-
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -540,15 +552,35 @@ static int x86_smt_flags(void)
return cpu_smt_flags() | x86_sched_itmt_flags();
}
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+ return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
#endif
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, },
+};
+
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
@@ -556,6 +588,9 @@ static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -583,6 +618,7 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
@@ -592,14 +628,29 @@ void set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
+ if (match_pkg(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_die(c, o)))
+ link_mask(topology_die_cpumask, cpu, i);
}
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+
+ for_each_cpu(i, topology_sibling_cpumask(cpu))
+ cpu_data(i).smt_active = threads > 1;
+
/*
* This needs a separate iteration over the cpus because we rely on all
* topology_sibling_cpumask links to be set-up.
@@ -613,8 +664,7 @@ void set_cpu_sibling_map(int cpu)
/*
* Does this new cpu bringup a new core?
*/
- if (cpumask_weight(
- topology_sibling_cpumask(cpu)) == 1) {
+ if (threads == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
@@ -631,16 +681,7 @@ void set_cpu_sibling_map(int cpu)
} else if (i != cpu && !c->booted_cores)
c->booted_cores = cpu_data(i).booted_cores;
}
- if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
-
- if ((i == cpu) || (has_mp && match_die(c, o)))
- link_mask(topology_die_cpumask, cpu, i);
}
-
- threads = cpumask_weight(topology_sibling_cpumask(cpu));
- if (threads > __max_smt_threads)
- __max_smt_threads = threads;
}
/* maps the cpu to the sched domain representing multi-core */
@@ -649,6 +690,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
return cpu_llc_shared_mask(cpu);
}
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+ return cpu_l2c_shared_mask(cpu);
+}
+
static void impress_friends(void)
{
int cpu;
@@ -1036,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
unsigned long boot_error = 0;
unsigned long timeout;
+#ifdef CONFIG_X86_64
+ /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+ if (apic->wakeup_secondary_cpu_64)
+ start_ip = real_mode_header->trampoline_start64;
+#endif
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
@@ -1077,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
/*
* Wake up a CPU in difference cases:
- * - Use the method in the APIC driver if it's defined
+ * - Use a method from the APIC driver if one defined, with wakeup
+ * straight to 64-bit mode preferred over wakeup to RM.
* Otherwise,
* - Use an INIT boot APIC message for APs or NMI for BSP.
*/
- if (apic->wakeup_secondary_cpu)
+ if (apic->wakeup_secondary_cpu_64)
+ boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip);
+ else if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
@@ -1309,12 +1363,7 @@ static void __init smp_get_logical_apicid(void)
cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
}
-/*
- * Prepare for SMP bootup.
- * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
- * for common interface support.
- */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus_common(void)
{
unsigned int i;
@@ -1332,6 +1381,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}
/*
@@ -1344,7 +1394,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
- init_freq_invariance(false, false);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_prepare_cpus_common();
+
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1377,6 +1437,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
smp_quirk_init_udelay();
speculative_store_bypass_ht_init();
+
+ snp_set_wakeup_secondary_cpu();
}
void arch_thaw_secondary_cpus_begin(void)
@@ -1407,7 +1469,7 @@ void __init calculate_max_logical_packages(void)
int ncpus;
/*
- * Today neither Intel nor AMD support heterogenous systems so
+ * Today neither Intel nor AMD support heterogeneous systems so
* extrapolate the boot cpu's data to all packages.
*/
ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
@@ -1421,8 +1483,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
@@ -1552,11 +1617,19 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, topology_die_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
- for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+
+ for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+ if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
+ cpu_data(sibling).smt_active = false;
+ }
+
for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
@@ -1659,13 +1732,17 @@ void play_dead_common(void)
local_irq_disable();
}
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
{
if (smp_processor_id() == 0 && enable_start_cpu0)
- return true;
-
- return false;
+ start_cpu0();
}
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
/*
* We need to flush the caches before going to sleep, lest we have
@@ -1734,11 +1811,8 @@ static inline void mwait_play_dead(void)
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
@@ -1749,11 +1823,8 @@ void hlt_play_dead(void)
while (1) {
native_halt();
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
@@ -1785,422 +1856,3 @@ void native_play_dead(void)
}
#endif
-
-#ifdef CONFIG_X86_64
-/*
- * APERF/MPERF frequency ratio computation.
- *
- * The scheduler wants to do frequency invariant accounting and needs a <1
- * ratio to account for the 'current' frequency, corresponding to
- * freq_curr / freq_max.
- *
- * Since the frequency freq_curr on x86 is controlled by micro-controller and
- * our P-state setting is little more than a request/hint, we need to observe
- * the effective frequency 'BusyMHz', i.e. the average frequency over a time
- * interval after discarding idle time. This is given by:
- *
- * BusyMHz = delta_APERF / delta_MPERF * freq_base
- *
- * where freq_base is the max non-turbo P-state.
- *
- * The freq_max term has to be set to a somewhat arbitrary value, because we
- * can't know which turbo states will be available at a given point in time:
- * it all depends on the thermal headroom of the entire package. We set it to
- * the turbo level with 4 cores active.
- *
- * Benchmarks show that's a good compromise between the 1C turbo ratio
- * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
- * which would ignore the entire turbo range (a conspicuous part, making
- * freq_curr/freq_max always maxed out).
- *
- * An exception to the heuristic above is the Atom uarch, where we choose the
- * highest turbo level for freq_max since Atom's are generally oriented towards
- * power efficiency.
- *
- * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
- * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
- */
-
-DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
-
-static DEFINE_PER_CPU(u64, arch_prev_aperf);
-static DEFINE_PER_CPU(u64, arch_prev_mperf);
-static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
-static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
-
-void arch_set_max_freq_ratio(bool turbo_disabled)
-{
- arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
- arch_turbo_freq_ratio;
-}
-EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
-
-static bool turbo_disabled(void)
-{
- u64 misc_en;
- int err;
-
- err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
- if (err)
- return false;
-
- return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
-}
-
-static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
-{
- int err;
-
- err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
- *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
-
- return true;
-}
-
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
-
-static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
- {}
-};
-
-static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
- X86_MATCH(SKYLAKE_X),
- {}
-};
-
-static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
- {}
-};
-
-static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
- int num_delta_fratio)
-{
- int fratio, delta_fratio, found;
- int err, i;
- u64 msr;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
- if (err)
- return false;
-
- fratio = (msr >> 8) & 0xFF;
- i = 16;
- found = 0;
- do {
- if (found >= num_delta_fratio) {
- *turbo_freq = fratio;
- return true;
- }
-
- delta_fratio = (msr >> (i + 5)) & 0x7;
-
- if (delta_fratio) {
- found += 1;
- fratio -= delta_fratio;
- }
-
- i += 8;
- } while (i < 64);
-
- return true;
-}
-
-static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
-{
- u64 ratios, counts;
- u32 group_size;
- int err, i;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
- if (err)
- return false;
-
- for (i = 0; i < 64; i += 8) {
- group_size = (counts >> i) & 0xFF;
- if (group_size >= size) {
- *turbo_freq = (ratios >> i) & 0xFF;
- return true;
- }
- }
-
- return false;
-}
-
-static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
-{
- u64 msr;
- int err;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
-
- /* The CPU may have less than 4 cores */
- if (!*turbo_freq)
- *turbo_freq = msr & 0xFF; /* 1C turbo */
-
- return true;
-}
-
-static bool intel_set_max_freq_ratio(void)
-{
- u64 base_freq, turbo_freq;
- u64 turbo_ratio;
-
- if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
- goto out;
-
- if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
- skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
- goto out;
-
- if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
- knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
- goto out;
-
- if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
- skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
- goto out;
-
- if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
- goto out;
-
- return false;
-
-out:
- /*
- * Some hypervisors advertise X86_FEATURE_APERFMPERF
- * but then fill all MSR's with zeroes.
- * Some CPUs have turbo boost but don't declare any turbo ratio
- * in MSR_TURBO_RATIO_LIMIT.
- */
- if (!base_freq || !turbo_freq) {
- pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
- return false;
- }
-
- turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
- if (!turbo_ratio) {
- pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
- return false;
- }
-
- arch_turbo_freq_ratio = turbo_ratio;
- arch_set_max_freq_ratio(turbo_disabled());
-
- return true;
-}
-
-#ifdef CONFIG_ACPI_CPPC_LIB
-static bool amd_set_max_freq_ratio(void)
-{
- struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
- u64 perf_ratio;
- int rc;
-
- rc = cppc_get_perf_caps(0, &perf_caps);
- if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
- return false;
- }
-
- highest_perf = perf_caps.highest_perf;
- nominal_perf = perf_caps.nominal_perf;
-
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
- return false;
- }
-
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
- /* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return false;
- }
-
- arch_turbo_freq_ratio = perf_ratio;
- arch_set_max_freq_ratio(false);
-
- return true;
-}
-#else
-static bool amd_set_max_freq_ratio(void)
-{
- return false;
-}
-#endif
-
-static void init_counter_refs(void)
-{
- u64 aperf, mperf;
-
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
-
- this_cpu_write(arch_prev_aperf, aperf);
- this_cpu_write(arch_prev_mperf, mperf);
-}
-
-#ifdef CONFIG_PM_SLEEP
-static struct syscore_ops freq_invariance_syscore_ops = {
- .resume = init_counter_refs,
-};
-
-static void register_freq_invariance_syscore_ops(void)
-{
- /* Bail out if registered already. */
- if (freq_invariance_syscore_ops.node.prev)
- return;
-
- register_syscore_ops(&freq_invariance_syscore_ops);
-}
-#else
-static inline void register_freq_invariance_syscore_ops(void) {}
-#endif
-
-static void init_freq_invariance(bool secondary, bool cppc_ready)
-{
- bool ret = false;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
-
- if (secondary) {
- if (static_branch_likely(&arch_scale_freq_key)) {
- init_counter_refs();
- }
- return;
- }
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- ret = intel_set_max_freq_ratio();
- else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (!cppc_ready) {
- return;
- }
- ret = amd_set_max_freq_ratio();
- }
-
- if (ret) {
- init_counter_refs();
- static_branch_enable(&arch_scale_freq_key);
- register_freq_invariance_syscore_ops();
- pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
- } else {
- pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
- }
-}
-
-#ifdef CONFIG_ACPI_CPPC_LIB
-static DEFINE_MUTEX(freq_invariance_lock);
-
-void init_freq_invariance_cppc(void)
-{
- static bool secondary;
-
- mutex_lock(&freq_invariance_lock);
-
- init_freq_invariance(secondary, true);
- secondary = true;
-
- mutex_unlock(&freq_invariance_lock);
-}
-#endif
-
-static void disable_freq_invariance_workfn(struct work_struct *work)
-{
- static_branch_disable(&arch_scale_freq_key);
-}
-
-static DECLARE_WORK(disable_freq_invariance_work,
- disable_freq_invariance_workfn);
-
-DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
-
-void arch_scale_freq_tick(void)
-{
- u64 freq_scale = SCHED_CAPACITY_SCALE;
- u64 aperf, mperf;
- u64 acnt, mcnt;
-
- if (!arch_scale_freq_invariant())
- return;
-
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
-
- acnt = aperf - this_cpu_read(arch_prev_aperf);
- mcnt = mperf - this_cpu_read(arch_prev_mperf);
-
- this_cpu_write(arch_prev_aperf, aperf);
- this_cpu_write(arch_prev_mperf, mperf);
-
- if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
- goto error;
-
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
- goto error;
-
- freq_scale = div64_u64(acnt, mcnt);
- if (!freq_scale)
- goto error;
-
- if (freq_scale > SCHED_CAPACITY_SCALE)
- freq_scale = SCHED_CAPACITY_SCALE;
-
- this_cpu_write(arch_freq_scale, freq_scale);
- return;
-
-error:
- pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
- schedule_work(&disable_freq_invariance_work);
-}
-#else
-static inline void init_freq_invariance(bool secondary, bool cppc_ready)
-{
-}
-#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8627fda8d993..ee117fcf46ed 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
}
}
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack. Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
void *cookie, struct task_struct *task)
{
@@ -96,7 +90,7 @@ copy_stack_frame(const struct stack_frame_user __user *fp,
{
int ret;
- if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE))
+ if (!__access_ok(fp, sizeof(*frame)))
return 0;
ret = 1;
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9442c4136c38..aa72cefdd5be 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -12,10 +12,11 @@ enum insn_type {
};
/*
- * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
- * The REX.W cancels the effect of any data16.
+ * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax
*/
-static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
+
+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
@@ -34,7 +35,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;
case NOP:
- code = ideal_nops[NOP_ATOMIC5];
+ code = x86_nops[5];
break;
case JMP:
@@ -42,8 +43,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;
case RET:
- code = text_gen_insn(RET_INSN_OPCODE, insn, func);
- size = RET_INSN_SIZE;
+ code = &retinsn;
break;
}
@@ -56,17 +56,22 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
text_poke_bp(insn, code, size, emulate);
}
-static void __static_call_validate(void *insn, bool tail)
+static void __static_call_validate(void *insn, bool tail, bool tramp)
{
u8 opcode = *(u8 *)insn;
+ if (tramp && memcmp(insn+5, "SCT", 3)) {
+ pr_err("trampoline signature fail");
+ BUG();
+ }
+
if (tail) {
if (opcode == JMP32_INSN_OPCODE ||
opcode == RET_INSN_OPCODE)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, x86_nops[5], 5) ||
!memcmp(insn, xor5rax, 5))
return;
}
@@ -74,7 +79,8 @@ static void __static_call_validate(void *insn, bool tail)
/*
* If we ever trigger this, our text is corrupt, we'll probably not live long.
*/
- WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ BUG();
}
static inline enum insn_type __sc_insn(bool null, bool tail)
@@ -97,12 +103,12 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
mutex_lock(&text_mutex);
if (tramp) {
- __static_call_validate(tramp, true);
+ __static_call_validate(tramp, true, true);
__static_call_transform(tramp, __sc_insn(!func, true), func);
}
if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) {
- __static_call_validate(site, tail);
+ __static_call_validate(site, tail, false);
__static_call_transform(site, __sc_insn(!func, tail), func);
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 0f3c307b37b3..8e2b2552b5ee 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -180,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on)
*
* NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
* task is current or it can't be running, otherwise we can race
- * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
- * PTRACE_KILL is not safe.
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced().
*/
local_irq_disable();
debugctl = get_debugctlmsr();
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 660b78827638..8cc653ffdccd 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str)
if (*str == 0)
return 1;
- if (*str == '=')
- str++;
-
if (!strcmp(str, "32"))
va_align.flags = ALIGN_VA_32;
else if (!strcmp(str, "64"))
@@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str)
else if (!strcmp(str, "on"))
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
else
- return 0;
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
return 1;
}
-__setup("align_va_addr", control_va_addr_alignment);
+__setup("align_va_addr=", control_va_addr_alignment);
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
deleted file mode 100644
index 014ebd8ca869..000000000000
--- a/arch/x86/kernel/sysfb.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-/*
- * Simple-Framebuffer support for x86 systems
- * Create a platform-device for any available boot framebuffer. The
- * simple-framebuffer platform device is already available on DT systems, so
- * this module parses the global "screen_info" object and creates a suitable
- * platform device compatible with the "simple-framebuffer" DT object. If
- * the framebuffer is incompatible, we instead create a legacy
- * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
- * pass the screen_info as platform_data. This allows legacy drivers
- * to pick these devices up without messing with simple-framebuffer drivers.
- * The global "screen_info" is still valid at all times.
- *
- * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
- * platform devices, but only use legacy framebuffer devices for
- * backwards compatibility.
- *
- * TODO: We set the dev_id field of all platform-devices to 0. This allows
- * other x86 OF/DT parsers to create such devices, too. However, they must
- * start at offset 1 for this to work.
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/platform_device.h>
-#include <linux/screen_info.h>
-#include <asm/sysfb.h>
-
-static __init int sysfb_init(void)
-{
- struct screen_info *si = &screen_info;
- struct simplefb_platform_data mode;
- struct platform_device *pd;
- const char *name;
- bool compatible;
- int ret;
-
- sysfb_apply_efi_quirks();
-
- /* try to create a simple-framebuffer device */
- compatible = parse_mode(si, &mode);
- if (compatible) {
- ret = create_simplefb(si, &mode);
- if (!ret)
- return 0;
- }
-
- /* if the FB is incompatible, create a legacy framebuffer device */
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
- name = "efi-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- name = "vesa-framebuffer";
- else
- name = "platform-framebuffer";
-
- pd = platform_device_register_resndata(NULL, name, 0,
- NULL, 0, si, sizeof(*si));
- return PTR_ERR_OR_ZERO(pd);
-}
-
-/* must execute after PCI subsystem for EFI quirks */
-device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
deleted file mode 100644
index 653b7f617b61..000000000000
--- a/arch/x86/kernel/sysfb_efi.c
+++ /dev/null
@@ -1,284 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- *
- * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
- */
-
-/*
- * EFI Quirks
- * Several EFI systems do not correctly advertise their boot framebuffers.
- * Hence, we use this static table of known broken machines and fix up the
- * information so framebuffer drivers can load corectly.
- */
-
-#include <linux/dmi.h>
-#include <linux/err.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <linux/screen_info.h>
-#include <video/vga.h>
-
-#include <asm/efi.h>
-#include <asm/sysfb.h>
-
-enum {
- OVERRIDE_NONE = 0x0,
- OVERRIDE_BASE = 0x1,
- OVERRIDE_STRIDE = 0x2,
- OVERRIDE_HEIGHT = 0x4,
- OVERRIDE_WIDTH = 0x8,
-};
-
-struct efifb_dmi_info efifb_dmi_list[] = {
- [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
- [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
- [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
- [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
- [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
- [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
- [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
- [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- /* 11" Macbook Air 3,1 passes the wrong stride */
- [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
- [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
- [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
- [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
-};
-
-void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
-{
- int i;
-
- for (i = 0; i < M_UNKNOWN; i++) {
- if (efifb_dmi_list[i].base != 0 &&
- !strcmp(opt, efifb_dmi_list[i].optname)) {
- si->lfb_base = efifb_dmi_list[i].base;
- si->lfb_linelength = efifb_dmi_list[i].stride;
- si->lfb_width = efifb_dmi_list[i].width;
- si->lfb_height = efifb_dmi_list[i].height;
- }
- }
-}
-
-#define choose_value(dmivalue, fwvalue, field, flags) ({ \
- typeof(fwvalue) _ret_ = fwvalue; \
- if ((flags) & (field)) \
- _ret_ = dmivalue; \
- else if ((fwvalue) == 0) \
- _ret_ = dmivalue; \
- _ret_; \
- })
-
-static int __init efifb_set_system(const struct dmi_system_id *id)
-{
- struct efifb_dmi_info *info = id->driver_data;
-
- if (info->base == 0 && info->height == 0 && info->width == 0 &&
- info->stride == 0)
- return 0;
-
- /* Trust the bootloader over the DMI tables */
- if (screen_info.lfb_base == 0) {
-#if defined(CONFIG_PCI)
- struct pci_dev *dev = NULL;
- int found_bar = 0;
-#endif
- if (info->base) {
- screen_info.lfb_base = choose_value(info->base,
- screen_info.lfb_base, OVERRIDE_BASE,
- info->flags);
-
-#if defined(CONFIG_PCI)
- /* make sure that the address in the table is actually
- * on a VGA device's PCI BAR */
-
- for_each_pci_dev(dev) {
- int i;
- if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
- continue;
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
- resource_size_t start, end;
- unsigned long flags;
-
- flags = pci_resource_flags(dev, i);
- if (!(flags & IORESOURCE_MEM))
- continue;
-
- if (flags & IORESOURCE_UNSET)
- continue;
-
- if (pci_resource_len(dev, i) == 0)
- continue;
-
- start = pci_resource_start(dev, i);
- end = pci_resource_end(dev, i);
- if (screen_info.lfb_base >= start &&
- screen_info.lfb_base < end) {
- found_bar = 1;
- break;
- }
- }
- }
- if (!found_bar)
- screen_info.lfb_base = 0;
-#endif
- }
- }
- if (screen_info.lfb_base) {
- screen_info.lfb_linelength = choose_value(info->stride,
- screen_info.lfb_linelength, OVERRIDE_STRIDE,
- info->flags);
- screen_info.lfb_width = choose_value(info->width,
- screen_info.lfb_width, OVERRIDE_WIDTH,
- info->flags);
- screen_info.lfb_height = choose_value(info->height,
- screen_info.lfb_height, OVERRIDE_HEIGHT,
- info->flags);
- if (screen_info.orig_video_isVGA == 0)
- screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
- } else {
- screen_info.lfb_linelength = 0;
- screen_info.lfb_width = 0;
- screen_info.lfb_height = 0;
- screen_info.orig_video_isVGA = 0;
- return 0;
- }
-
- printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
- "(%dx%d, stride %d)\n", id->ident,
- screen_info.lfb_base, screen_info.lfb_width,
- screen_info.lfb_height, screen_info.lfb_linelength);
-
- return 1;
-}
-
-#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
- { \
- efifb_set_system, \
- name, \
- { \
- DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
- DMI_MATCH(DMI_PRODUCT_NAME, name) \
- }, \
- &efifb_dmi_list[enumid] \
- }
-
-static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
- {},
-};
-
-/*
- * Some devices have a portrait LCD but advertise a landscape resolution (and
- * pitch). We simply swap width and height for these devices so that we can
- * correctly deal with some of them coming with multiple resolutions.
- */
-static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = {
- {
- /*
- * Lenovo MIIX310-10ICR, only some batches have the troublesome
- * 800x1280 portrait screen. Luckily the portrait version has
- * its own BIOS version, so we match on that.
- */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"),
- DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"),
- },
- },
- {
- /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
- "Lenovo MIIX 320-10ICR"),
- },
- },
- {
- /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
- "Lenovo ideapad D330-10IGM"),
- },
- },
- {},
-};
-
-__init void sysfb_apply_efi_quirks(void)
-{
- if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
- !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
- dmi_check_system(efifb_dmi_system_table);
-
- if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
- dmi_check_system(efifb_dmi_swap_width_height)) {
- u16 temp = screen_info.lfb_width;
-
- screen_info.lfb_width = screen_info.lfb_height;
- screen_info.lfb_height = temp;
- screen_info.lfb_linelength = 4 * screen_info.lfb_width;
- }
-}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
deleted file mode 100644
index 298fc1edd9c9..000000000000
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-/*
- * simple-framebuffer probing
- * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
- * If the mode is incompatible, we return "false" and let the caller create
- * legacy nodes instead.
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/platform_device.h>
-#include <linux/screen_info.h>
-#include <asm/sysfb.h>
-
-static const char simplefb_resname[] = "BOOTFB";
-static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
-
-/* try parsing x86 screen_info into a simple-framebuffer mode struct */
-__init bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode)
-{
- const struct simplefb_format *f;
- __u8 type;
- unsigned int i;
-
- type = si->orig_video_isVGA;
- if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
- return false;
-
- for (i = 0; i < ARRAY_SIZE(formats); ++i) {
- f = &formats[i];
- if (si->lfb_depth == f->bits_per_pixel &&
- si->red_size == f->red.length &&
- si->red_pos == f->red.offset &&
- si->green_size == f->green.length &&
- si->green_pos == f->green.offset &&
- si->blue_size == f->blue.length &&
- si->blue_pos == f->blue.offset &&
- si->rsvd_size == f->transp.length &&
- si->rsvd_pos == f->transp.offset) {
- mode->format = f->name;
- mode->width = si->lfb_width;
- mode->height = si->lfb_height;
- mode->stride = si->lfb_linelength;
- return true;
- }
- }
-
- return false;
-}
-
-__init int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode)
-{
- struct platform_device *pd;
- struct resource res;
- u64 base, size;
- u32 length;
-
- /*
- * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
- * upper half of the base address. Assemble the address, then make sure
- * it is valid and we can actually access it.
- */
- base = si->lfb_base;
- if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
- base |= (u64)si->ext_lfb_base << 32;
- if (!base || (u64)(resource_size_t)base != base) {
- printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
- return -EINVAL;
- }
-
- /*
- * Don't use lfb_size as IORESOURCE size, since it may contain the
- * entire VMEM, and thus require huge mappings. Use just the part we
- * need, that is, the part where the framebuffer is located. But verify
- * that it does not exceed the advertised VMEM.
- * Note that in case of VBE, the lfb_size is shifted by 16 bits for
- * historical reasons.
- */
- size = si->lfb_size;
- if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- size <<= 16;
- length = mode->height * mode->stride;
- if (length > size) {
- printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
- return -EINVAL;
- }
- length = PAGE_ALIGN(length);
-
- /* setup IORESOURCE_MEM as framebuffer memory */
- memset(&res, 0, sizeof(res));
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- res.name = simplefb_resname;
- res.start = base;
- res.end = res.start + length - 1;
- if (res.end <= res.start)
- return -EINVAL;
-
- pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
- &res, 1, mode, sizeof(*mode));
- return PTR_ERR_OR_ZERO(pd);
-}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c09ba110204..0c1154a1c403 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -24,7 +24,6 @@
#include <asm/processor.h>
#include <asm/bootparam.h>
#include <asm/pgalloc.h>
-#include <asm/swiotlb.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -49,6 +48,30 @@ bool tboot_enabled(void)
return tboot != NULL;
}
+/* noinline to prevent gcc from warning about dereferencing constant fixaddr */
+static noinline __init bool check_tboot_version(void)
+{
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ return false;
+ }
+
+ if (tboot->version < 5) {
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
+ return false;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+
+ return true;
+}
+
void __init tboot_probe(void)
{
/* Look for valid page-aligned address for shared page. */
@@ -66,25 +89,9 @@ void __init tboot_probe(void)
/* Map and check for tboot UUID. */
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
- tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
- if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ tboot = (void *)fix_to_virt(FIX_TBOOT_BASE);
+ if (!check_tboot_version())
tboot = NULL;
- return;
- }
- if (tboot->version < 5) {
- pr_warn("tboot version is invalid: %u\n", tboot->version);
- tboot = NULL;
- return;
- }
-
- pr_info("found shared page at phys addr 0x%llx:\n",
- boot_params.tboot_addr);
- pr_debug("version: %d\n", tboot->version);
- pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
- pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
- pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
- pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}
static pgd_t *tboot_pg_dir;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 64a496a0687f..3c883e064242 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -164,17 +164,11 @@ int do_set_thread_area(struct task_struct *p, int idx,
savesegment(fs, sel);
if (sel == modified_sel)
loadsegment(fs, sel);
-
- savesegment(gs, sel);
- if (sel == modified_sel)
- load_gs_index(sel);
#endif
-#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, sel);
if (sel == modified_sel)
- loadsegment(gs, sel);
-#endif
+ load_gs_index(sel);
} else {
#ifdef CONFIG_X86_64
if (p->thread.fsindex == modified_sel)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index f5477eab5692..8617d1ed9d31 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -113,7 +113,7 @@ int arch_register_cpu(int num)
* Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
* depends on BSP. PIC interrupts depend on BSP.
*
- * If the BSP depencies are under control, one can tell kernel to
+ * If the BSP dependencies are under control, one can tell kernel to
* enable BSP hotplug. This basically adds a control file and
* one can attempt to offline BSP.
*/
@@ -154,11 +154,6 @@ static int __init topology_init(void)
{
int i;
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif
-
for_each_present_cpu(i)
arch_register_cpu(i);
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..8322e8352777
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,234 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+ osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+ char *vector_desc = (char *) data;
+
+ osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+ int ret;
+
+ ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ if (ret)
+ goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_timer_exit;
+
+ ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ if (ret)
+ goto out_thermal_entry;
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+#ifdef CONFIG_X86_MCE_AMD
+ ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_thermal_exit;
+
+ ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ if (ret)
+ goto out_deferred_entry;
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_deferred_exit;
+
+ ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ if (ret)
+ goto out_threshold_entry;
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
+#ifdef CONFIG_SMP
+ ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_threshold_exit;
+
+ ret = register_trace_call_function_single_exit(trace_intel_irq_exit,
+ "call_function_single");
+ if (ret)
+ goto out_call_function_single_entry;
+
+ ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_single_exit;
+
+ ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ if (ret)
+ goto out_call_function_entry;
+
+ ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_exit;
+
+ ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ if (ret)
+ goto out_reschedule_entry;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+ ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_reschedule_exit;
+
+ ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ if (ret)
+ goto out_irq_work_entry;
+#endif
+
+ ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_irq_work_exit;
+
+ ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ if (ret)
+ goto out_x86_ipi_entry;
+
+ ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_x86_ipi_exit;
+
+ ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ if (ret)
+ goto out_error_apic_entry;
+
+ ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_error_apic_exit;
+
+ ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ if (ret)
+ goto out_spurious_apic_entry;
+
+ return 0;
+
+out_spurious_apic_entry:
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+out_error_apic_exit:
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+out_error_apic_entry:
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+out_x86_ipi_exit:
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+out_x86_ipi_entry:
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+out_irq_work_exit:
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+out_irq_work_entry:
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+out_reschedule_exit:
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+out_reschedule_entry:
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+out_call_function_exit:
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+out_call_function_entry:
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+ return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+ unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index fcfc077afe2d..03ae1caaa878 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -1,17 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Code for supporting irq vector tracepoints.
- *
* Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
- *
*/
#include <linux/jump_label.h>
#include <linux/atomic.h>
-#include <asm/hw_irq.h>
-#include <asm/desc.h>
#include <asm/trace/exceptions.h>
-#include <asm/trace/irq_vectors.h>
DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ac1874a2a70e..d62b2cb85cea 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -39,6 +39,7 @@
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
+#include <linux/ioasid.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
@@ -48,7 +49,7 @@
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
@@ -61,6 +62,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/tdx.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -209,6 +211,81 @@ DEFINE_IDTENTRY(exc_overflow)
do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
+#ifdef CONFIG_X86_KERNEL_IBT
+
+static __ro_after_init bool ibt_fatal = true;
+
+extern void ibt_selftest_ip(void); /* code label defined in asm below */
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_IBT)) {
+ pr_err("Unexpected #CP\n");
+ BUG();
+ }
+
+ if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR))
+ return;
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) {
+ regs->ax = 0;
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ return;
+ }
+ BUG();
+}
+
+/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */
+noinline bool ibt_selftest(void)
+{
+ unsigned long ret;
+
+ asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t"
+ ANNOTATE_RETPOLINE_SAFE
+ " jmp *%%rax\n\t"
+ "ibt_selftest_ip:\n\t"
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ " nop\n\t"
+
+ : "=a" (ret) : : "memory");
+
+ return !ret;
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
@@ -313,17 +390,19 @@ out:
}
#ifdef CONFIG_VMAP_STACK
-__visible void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address)
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
{
- printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
- (void *)fault_address, current->stack,
- (char *)current->stack + THREAD_SIZE - 1);
- die(message, regs, 0);
+ const char *name = stack_type_name(info->type);
+
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
+
+ die("stack guard page", regs, 0);
/* Be absolutely certain we don't return. */
- panic("%s", message);
+ panic("%s stack guard hit", name);
}
#endif
@@ -353,6 +432,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
#ifdef CONFIG_VMAP_STACK
unsigned long address = read_cr2();
+ struct stack_info info;
#endif
#ifdef CONFIG_X86_ESPFIX64
@@ -395,7 +475,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
- * we won't enable interupts or schedule before we invoke
+ * we won't enable interrupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*
@@ -455,10 +535,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
- handle_stack_overflow("kernel stack overflow (double-fault)",
- regs, address);
- }
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
@@ -498,14 +576,15 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
{
u8 insn_buf[MAX_INSN_SIZE];
struct insn insn;
+ int ret;
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
MAX_INSN_SIZE))
return GP_NO_HINT;
- kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
- insn_get_modrm(&insn);
- insn_get_sib(&insn);
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return GP_NO_HINT;
*addr = (unsigned long)insn_get_addr_ref(&insn, regs);
if (*addr == -1UL)
@@ -527,13 +606,124 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
+ }
+
+ regs->ip += 1;
+ return true;
+}
+
+/*
+ * The unprivileged ENQCMD instruction generates #GPs if the
+ * IA32_PASID MSR has not been populated. If possible, populate
+ * the MSR from a PASID previously allocated to the mm.
+ */
+static bool try_fixup_enqcmd_gp(void)
+{
+#ifdef CONFIG_IOMMU_SVA
+ u32 pasid;
+
+ /*
+ * MSR_IA32_PASID is managed using XSAVE. Directly
+ * writing to the MSR is only possible when fpregs
+ * are valid and the fpstate is not. This is
+ * guaranteed when handling a userspace exception
+ * in *before* interrupts are re-enabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Hardware without ENQCMD will not generate
+ * #GPs that can be fixed up here.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ pasid = current->mm->pasid;
+
+ /*
+ * If the mm has not been allocated a
+ * PASID, the #GP can not be fixed up.
+ */
+ if (!pasid_valid(pasid))
+ return false;
+
+ /*
+ * Did this thread already have its PASID activated?
+ * If so, the #GP must be from something else.
+ */
+ if (current->pasid_activated)
+ return false;
+
+ wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
+ current->pasid_activated = 1;
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
- struct task_struct *tsk;
unsigned long gp_addr;
- int ret;
+
+ if (user_mode(regs) && try_fixup_enqcmd_gp())
+ return;
cond_local_irq_enable(regs);
@@ -549,37 +739,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
return;
}
- tsk = current;
-
if (user_mode(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
+ if (fixup_iopl_exception(regs))
+ goto exit;
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
@@ -624,6 +795,7 @@ static bool do_int3(struct pt_regs *regs)
return res == NOTIFY_STOP;
}
+NOKPROBE_SYMBOL(do_int3);
static void do_int3_user(struct pt_regs *regs)
{
@@ -708,7 +880,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
stack = (unsigned long *)sp;
if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
- info.type >= STACK_TYPE_EXCEPTION_LAST)
+ info.type > STACK_TYPE_EXCEPTION_LAST)
sp = __this_cpu_ist_top_va(VC2);
sync:
@@ -726,14 +898,10 @@ sync:
}
#endif
-struct bad_iret_stack {
- void *error_entry_ret;
- struct pt_regs regs;
-};
-
-asmlinkage __visible noinstr
-struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
{
+ struct pt_regs tmp, *new_stack;
+
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
@@ -742,19 +910,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
- struct bad_iret_stack tmp, *new_stack =
- (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the temporary storage. */
- __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
+ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));
/* Update the entry stack */
__memcpy(new_stack, &tmp, sizeof(tmp));
- BUG_ON(!user_mode(&new_stack->regs));
+ BUG_ON(!user_mode(new_stack));
return new_stack;
}
#endif
@@ -889,9 +1056,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
dr6 &= ~DR_STEP;
- if (kprobe_debug_handler(regs))
- goto out;
-
/*
* The kernel doesn't use INT1
*/
@@ -978,6 +1142,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
goto out_irq;
}
+ /* #DB for bus lock can only be triggered from userspace. */
+ if (dr6 & DR_BUS_LOCK)
+ handle_bus_lock(regs);
+
/* Add the virtual_dr6 bits for signals. */
dr6 |= current->thread.virtual_dr6;
if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
@@ -1044,9 +1212,10 @@ static void math_error(struct pt_regs *regs, int trapnr)
}
/*
- * Save the info for the exception handler and clear the error.
+ * Synchronize the FPU register state to the memory register state
+ * if necessary. This allows the exception handler to inspect it.
*/
- fpu__save(fpu);
+ fpu_sync_fpstate(fpu);
task->thread.trap_nr = trapnr;
task->thread.error_code = 0;
@@ -1057,7 +1226,7 @@ static void math_error(struct pt_regs *regs, int trapnr)
goto exit;
if (fixup_vdso_exception(regs, trapnr, 0, 0))
- return;
+ goto exit;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
@@ -1105,10 +1274,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
*/
}
+static bool handle_xfd_event(struct pt_regs *regs)
+{
+ u64 xfd_err;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
+ return false;
+
+ rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
+ if (!xfd_err)
+ return false;
+
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
+ /* Die if that happens in kernel space */
+ if (WARN_ON(!user_mode(regs)))
+ return false;
+
+ local_irq_enable();
+
+ err = xfd_enable_feature(xfd_err);
+
+ switch (err) {
+ case -EPERM:
+ force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
+ break;
+ case -EFAULT:
+ force_sig(SIGSEGV);
+ break;
+ }
+
+ local_irq_disable();
+ return true;
+}
+
DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
+ if (handle_xfd_event(regs))
+ return;
+
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
@@ -1137,6 +1344,91 @@ DEFINE_IDTENTRY(exc_device_not_available)
}
}
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+ return;
+
+ die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard to debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues. Returning from the exception handler with
+ * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory,
+ * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
+ * that might generate #VE. VMM can remove memory from TD at any point,
+ * but access to unaccepted (or missing) private memory leads to VM
+ * termination, not to #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
+ * the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (fault
+ * exception) is delivered to the guest which will result in an oops.
+ *
+ * The entry code has been audited carefully for following these expectations.
+ * Changes in the entry code have to be audited for correctness vs. this
+ * aspect. Similarly to #PF, #VE in these places will expose kernel to
+ * privilege escalation or may lead to random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+ struct ve_info ve;
+
+ /*
+ * NMIs/Machine-checks/Interrupts will be in a disabled state
+ * till TDGETVEINFO TDCALL is executed. This ensures that VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
+
+ cond_local_irq_enable(regs);
+
+ /*
+ * If tdx_handle_virt_exception() could not process
+ * it successfully, treat it as #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0);
+
+ cond_local_irq_disable(regs);
+}
+
+#endif
+
#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
@@ -1158,12 +1450,9 @@ void __init trap_init(void)
/* Init GHCB memory pages when running as an SEV-ES guest */
sev_es_init_vc_handling();
+ /* Initialize TSS before setting up traps so ISTs work */
+ cpu_init_exception_handling();
+ /* Setup traps as cpu_init() might #GP */
idt_setup_traps();
-
- /*
- * Should be a barrier for any external CPU state:
- */
cpu_init();
-
- idt_setup_ist_traps();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f70dffc2771f..cafacb2e58cc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
+#include <linux/static_call.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -254,7 +255,7 @@ unsigned long long sched_clock(void)
bool using_native_sched_clock(void)
{
- return pv_ops.time.sched_clock == native_sched_clock;
+ return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
unsigned long long
@@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for any possible disturbance. We dicard
+ * reference read for any possible disturbance. We discard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs)
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
* is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
+ * is slightly behind. This delta is nowhere else observable, but in
* that case it results in a forward time jump in the range of hours
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
@@ -1127,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
static struct clocksource clocksource_tsc_early = {
.name = "tsc-early",
.rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -1151,7 +1153,8 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
- CLOCK_SOURCE_MUST_VERIFY,
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
@@ -1177,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1193,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1264,7 +1290,7 @@ EXPORT_SYMBOL(convert_art_to_tsc);
* corresponding clocksource
* @cycles: System counter value
* @cs: Clocksource corresponding to system counter value. Used
- * by timekeeping code to verify comparibility of two cycle
+ * by timekeeping code to verify comparability of two cycle
* values.
*/
@@ -1384,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1464,6 +1487,9 @@ static unsigned long __init get_loops_per_jiffy(void)
static void __init tsc_enable_sched_clock(void)
{
+ loops_per_jiffy = get_loops_per_jiffy();
+ use_tsc_delay();
+
/* Sanitize TSC ADJUST before cyc2ns gets initialized */
tsc_store_and_check_tsc_adjust(true);
cyc2ns_init_boot_cpu();
@@ -1479,8 +1505,6 @@ void __init tsc_early_init(void)
return;
if (!determine_cpu_tsc_frequencies(true))
return;
- loops_per_jiffy = get_loops_per_jiffy();
-
tsc_enable_sched_clock();
}
@@ -1514,7 +1538,6 @@ void __init tsc_init(void)
enable_sched_clock_irqtime();
lpj_fine = get_loops_per_jiffy();
- use_tsc_delay();
check_system_tsc_reliable();
@@ -1524,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 3d3c761eb74a..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
@@ -472,7 +513,7 @@ retry:
/*
* Add the result to the previous adjustment value.
*
- * The adjustement value is slightly off by the overhead of the
+ * The adjustment value is slightly off by the overhead of the
* sync mechanism (observed values are ~200 TSC cycles), but this
* really depends on CPU, node distance and frequency. So
* compensating for this is hard to get right. Experiments show
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index f6225bf22c02..5a4b21389b1d 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -92,8 +92,8 @@ static const char * const umip_insns[5] = {
#define umip_pr_err(regs, fmt, ...) \
umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
-#define umip_pr_warn(regs, fmt, ...) \
- umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define umip_pr_debug(regs, fmt, ...) \
+ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__)
/**
* umip_printk() - Print a rate-limited message
@@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
* by whether the operand is a register or a memory location.
* If operand is a register, return as many bytes as the operand
* size. If operand is memory, return only the two least
- * siginificant bytes.
+ * significant bytes.
*/
if (X86_MODRM_MOD(insn->modrm.value) == 3)
*data_size = insn->opnd_bytes;
@@ -346,27 +346,25 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (!regs)
return false;
- nr_copied = insn_fetch_from_user(regs, buf);
-
/*
- * The insn_fetch_from_user above could have failed if user code
- * is protected by a memory protection key. Give up on emulation
- * in such a case. Should we issue a page fault?
+ * Give up on emulation if fetching the instruction failed. Should a
+ * page fault or a #GP be issued?
*/
- if (!nr_copied)
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
return false;
- if (!insn_decode(&insn, regs, buf, nr_copied))
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
return false;
umip_inst = identify_insn(&insn);
if (umip_inst < 0)
return false;
- umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
+ umip_pr_debug(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
+ umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
user_64bit_mode(regs)))
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d7c44b257f7f..8e1c50c86e5d 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -240,8 +240,7 @@ static bool update_stack_state(struct unwind_state *state,
else {
addr_p = unwind_get_return_address_ptr(state);
addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, addr_p);
+ state->ip = unwind_recover_ret_addr(state, addr, addr_p);
}
/* Save the original stack pointer for unwind_dump(): */
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index c49f10ffd8cd..884d68a6e714 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
addr = READ_ONCE_NOCHECK(*state->sp);
- return ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, state->sp);
+ return unwind_recover_ret_addr(state, addr, state->sp);
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a1202536fc57..38185aedf7d1 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -175,7 +175,7 @@ static struct orc_entry *orc_find(unsigned long ip)
}
/* vmlinux .init slow lookup: */
- if (init_kernel_text(ip))
+ if (is_kernel_inittext(ip))
return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
__stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
@@ -339,11 +339,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
- if (!on_stack(info, addr, len) &&
- (get_stack_info(addr, state->task, info, &state->stack_mask)))
- return false;
+ if (on_stack(info, addr, len))
+ return true;
- return true;
+ return !get_stack_info(addr, state->task, info, &state->stack_mask) &&
+ on_stack(info, addr, len);
}
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
@@ -534,9 +534,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (!deref_stack_reg(state, ip_p, &state->ip))
goto err;
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- state->ip, (void *)ip_p);
-
+ state->ip = unwind_recover_ret_addr(state, state->ip,
+ (unsigned long *)ip_p);
state->sp = sp;
state->regs = NULL;
state->prev_regs = NULL;
@@ -549,7 +548,18 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
-
+ /*
+ * There is a small chance to interrupt at the entry of
+ * arch_rethook_trampoline() where the ORC info doesn't exist.
+ * That point is right after the RET to arch_rethook_trampoline()
+ * which was modified return address.
+ * At that point, the @addr_p of the unwind_recover_rethook()
+ * (this has to point the address of the stack entry storing
+ * the modified return address) must be "SP - (a stack entry)"
+ * because SP is incremented by the RET.
+ */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
state->regs = (struct pt_regs *)sp;
state->prev_regs = NULL;
state->full_regs = true;
@@ -562,6 +572,9 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
+ /* See UNWIND_HINT_TYPE_REGS case comment. */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
if (state->full_regs)
state->prev_regs = state->regs;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a2b413394917..b63cf8f7745e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -276,12 +276,12 @@ static bool is_prefix_bad(struct insn *insn)
static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
u32 volatile *good_insns;
+ int ret;
- insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
- /* has the side-effect of processing the entire instruction */
- insn_get_length(insn);
- if (!insn_complete(insn))
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
return -ENOEXEC;
if (is_prefix_bad(insn))
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 641f0fe1e5b4..1258a5872d12 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu)
.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
- ret
+ RET
.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
- ret
+ RET
SYM_FUNC_END(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e5a7a10a0164..e9e803a4d44c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -106,10 +106,8 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
*/
local_irq_enable();
- if (!vm86 || !vm86->user_vm86) {
- pr_alert("no user_vm86: BAD\n");
- do_exit(SIGSEGV);
- }
+ BUG_ON(!vm86);
+
set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
user = vm86->user_vm86;
@@ -142,6 +140,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
user_access_end();
+exit_vm86:
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
@@ -152,7 +151,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
- lazy_load_gs(vm86->regs32.gs);
+ loadsegment(gs, vm86->regs32.gs);
regs->pt.ax = retval;
return;
@@ -161,7 +160,8 @@ Efault_end:
user_access_end();
Efault:
pr_alert("could not access userspace vm86 info\n");
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
+ goto exit_vm86;
}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
@@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
* Save old state
*/
vm86->saved_sp0 = tsk->thread.sp0;
- lazy_save_gs(vm86->regs32.gs);
+ savesegment(gs, vm86->regs32.gs);
/* make room for real-mode segments */
preempt_disable();
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index efd9e9ea17f2..81aba718ecd5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -137,7 +137,6 @@ SECTIONS
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
STATIC_CALL_TEXT
- *(.fixup)
*(.gnu.warning)
#ifdef CONFIG_RETPOLINE
@@ -272,6 +271,29 @@ SECTIONS
__parainstructions_end = .;
}
+#ifdef CONFIG_RETPOLINE
+ /*
+ * List of instructions that call/jmp/jcc to retpoline thunks
+ * __x86_indirect_thunk_*(). These instructions can be patched along
+ * with alternatives, after which the section can be freed.
+ */
+ . = ALIGN(8);
+ .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) {
+ __retpoline_sites = .;
+ *(.retpoline_sites)
+ __retpoline_sites_end = .;
+ }
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+ . = ALIGN(8);
+ .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) {
+ __ibt_endbr_seal = .;
+ *(.ibt_endbr_seal)
+ __ibt_endbr_seal_end = .;
+ }
+#endif
+
/*
* struct alt_inst entries. From the header (alternative.h):
* "Alternative instructions for different CPU types or capabilities"
@@ -293,18 +315,6 @@ SECTIONS
*(.altinstr_replacement)
}
- /*
- * struct iommu_table_entry entries are injected in this section.
- * It is an array of IOMMUs which during run time gets sorted depending
- * on its dependency order. After rootfs_initcall is complete
- * this section can be safely removed.
- */
- .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
- __iommu_table = .;
- *(.iommu_table)
- __iommu_table_end = .;
- }
-
. = ALIGN(8);
.apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
__apicdrivers = .;
@@ -375,10 +385,10 @@ SECTIONS
__end_of_kernel_reserve = .;
. = ALIGN(PAGE_SIZE);
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 8b395821cb8d..e84ee5cdbd8c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -129,6 +129,11 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
+static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { }
+static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; }
+static bool enc_tlb_flush_required_noop(bool enc) { return false; }
+static bool enc_cache_flush_required_noop(void) { return false; }
+
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu_early,
.calibrate_tsc = native_calibrate_tsc,
@@ -138,24 +143,19 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .save_sched_clock_state = tsc_save_sched_clock_state,
- .restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
.hyper.pin_vcpu = x86_op_int_noop,
-};
-
-EXPORT_SYMBOL_GPL(x86_platform);
-#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi __ro_after_init = {
- .restore_msi_irqs = default_restore_msi_irqs,
+ .guest = {
+ .enc_status_change_prepare = enc_status_change_prepare_noop,
+ .enc_status_change_finish = enc_status_change_finish_noop,
+ .enc_tlb_flush_required = enc_tlb_flush_required_noop,
+ .enc_cache_flush_required = enc_cache_flush_required_noop,
+ },
};
-/* MSI arch specific hooks */
-void arch_restore_msi_irqs(struct pci_dev *dev)
-{
- x86_msi.restore_msi_irqs(dev);
-}
-#endif
+EXPORT_SYMBOL_GPL(x86_platform);
struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read = native_io_apic_read,
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..e3cbd7706136 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,13 +22,13 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on HIGH_RES_TIMERS
- # for TASKSTATS/TASK_DELAY_ACCT:
- depends on NET && MULTIUSER
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
+ select HAVE_KVM_DIRTY_RING
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
@@ -36,9 +36,9 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
- select TASKSTATS
- select TASK_DELAY_ACCT
+ select SCHED_INFO
select PERF_EVENTS
+ select GUEST_PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_NO_POLL
@@ -46,6 +46,8 @@ config KVM
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
select SRCU
+ select INTERVAL_TREE
+ select HAVE_KVM_PM_NOTIFIER if PM
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -84,6 +86,18 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
@@ -112,11 +126,7 @@ config KVM_XEN
If in doubt, say "N".
-config KVM_MMU_AUDIT
- bool "Audit KVM MMU"
- depends on KVM && TRACEPOINTS
- help
- This option adds a R/W kVM module parameter 'mmu_audit', which allows
- auditing of KVM MMU events at runtime.
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index eafc4d601f25..30f244b64523 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,24 +7,30 @@ ifeq ($(CONFIG_FRAME_POINTER),y)
OBJECT_FILES_NON_STANDARD_vmenter.o := y
endif
-KVM := ../../../virt/kvm
-
-kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
- $(KVM)/dirty_ring.o
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o
+
+ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+endif
+
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+ifdef CONFIG_HYPERV
+kvm-amd-y += svm/svm_onhyperv.o
+endif
+
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6bd2f8b830e4..de6d44e07e34 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,8 @@
#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
+#include <asm/cpuid.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
@@ -28,10 +30,10 @@
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
* aligned to sizeof(unsigned long) because it's not accessed via bitops.
*/
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
-static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -41,7 +43,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
if (xstate_bv & 0x1) {
u32 eax, ebx, ecx, edx, offset;
cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
- offset = compacted ? ret : ebx;
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = ebx;
ret = max(ret, offset + eax);
}
@@ -52,7 +58,15 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
+/*
+ * This one is tied to SSB in the user API, and not
+ * visible in /proc/cpuinfo.
+ */
+#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+
#define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -63,17 +77,20 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
for (i = 0; i < nent; i++) {
e = &entries[i];
- if (e->function == function && (e->index == index ||
- !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+ if (e->function == function &&
+ (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
return e;
}
return NULL;
}
-static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries,
+ int nent)
{
struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
/*
* The existing code assumes virtual address is 48-bit or 57-bit in the
@@ -87,14 +104,91 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return -EINVAL;
}
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
+
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
return 0;
}
-void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *best;
+ u32 function;
+ struct kvm_cpuid_entry2 *entry;
+
+ vcpu->arch.kvm_cpuid_base = 0;
+
+ for_each_possible_hypervisor_cpuid_base(function) {
+ entry = kvm_find_cpuid_entry(vcpu, function, 0);
- best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ if (entry) {
+ u32 signature[3];
+
+ signature[0] = entry->ebx;
+ signature[1] = entry->ecx;
+ signature[2] = entry->edx;
+
+ BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
+ if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
+ vcpu->arch.kvm_cpuid_base = function;
+ break;
+ }
+ }
+ }
+}
+
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries, int nent)
+{
+ u32 base = vcpu->arch.kvm_cpuid_base;
+
+ if (!base)
+ return NULL;
+
+ return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+ return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent);
+}
+
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu);
/*
* save the feature bitmap to avoid cpuid lookup for every PV
@@ -104,11 +198,28 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+}
+
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
+
+ best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -119,32 +230,52 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
}
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
- best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
+
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+ if (best) {
+ best->ecx &= guest_supported_xcr0 & 0xffffffff;
+ best->edx &= guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
@@ -152,6 +283,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
if (best && apic) {
@@ -163,12 +295,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
- if (!best)
- vcpu->arch.guest_supported_xcr0 = 0;
- else
- vcpu->arch.guest_supported_xcr0 =
- (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ guest_supported_xcr0 =
+ cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+
+ vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
kvm_update_pv_runtime(vcpu);
@@ -185,34 +315,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
/*
- * Except for the MMU, which needs to be reset after any vendor
- * specific adjustments to the reserved GPA bits.
+ * Except for the MMU, which needs to do its thing any vendor specific
+ * adjustments to the reserved GPA bits.
*/
- kvm_mmu_reset_context(vcpu);
-}
-
-static int is_efer_nx(void)
-{
- return host_efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_cpuid_entry2 *e, *entry;
-
- entry = NULL;
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- e = &vcpu->arch.cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
- cpuid_entry_clear(entry, X86_FEATURE_NX);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
+ kvm_mmu_after_set_cpuid(vcpu);
}
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
@@ -239,6 +345,47 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
}
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ int r;
+
+ __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(e2);
+ return 0;
+ }
+
+ r = kvm_check_cpuid(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(vcpu->arch.cpuid_entries);
+ vcpu->arch.cpuid_entries = e2;
+ vcpu->arch.cpuid_nent = nent;
+
+ kvm_update_kvm_cpuid_base(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+ return 0;
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -275,19 +422,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
e2[i].padding[2] = 0;
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- goto out_free_cpuid;
- }
-
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- cpuid_fix_nx_cap(vcpu);
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
out_free_cpuid:
kvfree(e);
@@ -311,20 +448,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
return PTR_ERR(e2);
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- return r;
- }
-
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
- return 0;
+ return r;
}
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
@@ -347,13 +475,13 @@ out:
return r;
}
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
{
const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
struct kvm_cpuid_entry2 entry;
reverse_cpuid_check(leaf);
- kvm_cpu_caps[leaf] &= mask;
cpuid_count(cpuid.function, cpuid.index,
&entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,22 +489,45 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
+static __always_inline
+void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ BUILD_BUG_ON(leaf < NCAPINTS);
+
+ kvm_cpu_caps[leaf] = mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ BUILD_BUG_ON(leaf >= NCAPINTS);
+
+ kvm_cpu_caps[leaf] &= mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
void kvm_set_cpu_caps(void)
{
- unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
unsigned int f_gbpages = F(GBPAGES);
unsigned int f_lm = F(LM);
+ unsigned int f_xfd = F(XFD);
#else
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
+ unsigned int f_xfd = 0;
#endif
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
- BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
- sizeof(kvm_cpu_caps));
+ sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
kvm_cpu_cap_mask(CPUID_1_ECX,
/*
@@ -407,18 +558,20 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
- F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
- F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
- F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
- );
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
+ F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
+ F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
+ F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
+ F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
+ F(AVX512VL));
kvm_cpu_cap_mask(CPUID_7_ECX,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+ F(SGX_LC) | F(BUS_LOCK_DETECT)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
@@ -435,7 +588,8 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -454,7 +608,11 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
- F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
+ );
+
+ kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2)
);
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
@@ -462,7 +620,7 @@ void kvm_set_cpu_caps(void)
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT) | F(PERFCTR_CORE)
+ F(TOPOEXT) | 0 /* PERFCTR_CORE */
);
kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
@@ -471,7 +629,7 @@ void kvm_set_cpu_caps(void)
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
);
@@ -482,7 +640,8 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
- F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
+ F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
+ __feature_bit(KVM_X86_FEATURE_PSFD)
);
/*
@@ -514,11 +673,30 @@ void kvm_set_cpu_caps(void)
*/
kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
+ kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
+ 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ F(SME_COHERENT));
+
kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN)
);
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
@@ -538,29 +716,37 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
entry = &array->entries[array->nent++];
+ memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
- entry->flags = 0;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+ break;
+
+ default:
+ break;
+ }
cpuid_count(entry->function, entry->index,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
- switch (function) {
- case 4:
- case 7:
- case 0xb:
- case 0xd:
- case 0xf:
- case 0x10:
- case 0x12:
- case 0x14:
- case 0x17:
- case 0x18:
- case 0x1f:
- case 0x8000001d:
+ if (cpuid_function_is_indexed(function))
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- break;
- }
return entry;
}
@@ -589,8 +775,10 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0;
- entry->ecx = F(RDPID);
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = F(RDPID);
++array->nent;
+ break;
default:
break;
}
@@ -684,13 +872,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
- * Only support guest architectural pmu on a host
- * with architectural pmu.
+ * The guest architecture pmu is only supported if the architecture
+ * pmu exists on the host and the module parameters allow it.
*/
- if (!cap.version)
+ if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2);
@@ -698,9 +891,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
eax.split.bit_width = cap.bit_width_gp;
eax.split.mask_length = cap.events_mask_len;
- edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
+ edx.split.num_counters_fixed =
+ min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED);
edx.split.bit_width_fixed = cap.bit_width_fixed;
- edx.split.anythread_deprecated = 1;
+ if (cap.version)
+ edx.split.anythread_deprecated = 1;
edx.split.reserved1 = 0;
edx.split.reserved2 = 0;
@@ -727,12 +922,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
- case 0xd:
- entry->eax &= supported_xcr0;
- entry->ebx = xstate_required_size(supported_xcr0, false);
+ case 0xd: {
+ u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xss = supported_xss;
+
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
entry->ecx = entry->ebx;
- entry->edx &= supported_xcr0 >> 32;
- if (!supported_xcr0)
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
break;
entry = do_host_cpuid(array, function, 1);
@@ -741,20 +939,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_D_1_EAX);
if (entry->eax & (F(XSAVES)|F(XSAVEC)))
- entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
true);
else {
- WARN_ON_ONCE(supported_xss != 0);
+ WARN_ON_ONCE(permitted_xss != 0);
entry->ebx = 0;
}
- entry->ecx &= supported_xss;
- entry->edx &= supported_xss >> 32;
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
for (i = 2; i < 64; ++i) {
bool s_state;
- if (supported_xcr0 & BIT_ULL(i))
+ if (permitted_xcr0 & BIT_ULL(i))
s_state = false;
- else if (supported_xss & BIT_ULL(i))
+ else if (permitted_xss & BIT_ULL(i))
s_state = true;
else
continue;
@@ -768,16 +966,52 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* invalid sub-leafs. Only valid sub-leafs should
* reach this point, and they should have a non-zero
* save state size. Furthermore, check whether the
- * processor agrees with supported_xcr0/supported_xss
+ * processor agrees with permitted_xcr0/permitted_xss
* on whether this is an XCR0- or IA32_XSS-managed area.
*/
if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
--array->nent;
continue;
}
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
entry->edx = 0;
}
break;
+ }
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+ SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+ SGX_ATTR_KSS;
+ entry->ebx &= 0;
+ break;
/* Intel PT */
case 0x14:
if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
@@ -790,9 +1024,26 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
case KVM_CPUID_SIGNATURE: {
- static const char signature[12] = "KVMKVMKVM\0\0";
- const u32 *sigptr = (const u32 *)signature;
+ const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
entry->ebx = sigptr[0];
entry->ecx = sigptr[1];
@@ -822,7 +1073,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
break;
case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001f);
+ entry->eax = min(entry->eax, 0x80000021);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
+ */
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
+ entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
@@ -843,8 +1111,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
unsigned phys_as = entry->eax & 0xff;
- if (!g_phys_as)
+ /*
+ * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
+ * the guest operates in the same PA space as the host, i.e.
+ * reductions in MAXPHYADDR for memory encryption affect shadow
+ * paging, too.
+ *
+ * If TDP is enabled but an explicit guest MAXPHYADDR is not
+ * provided, use the raw bare metal MAXPHYADDR as reductions to
+ * the HPAs do not affect GPAs.
+ */
+ if (!tdp_enabled)
+ g_phys_as = boot_cpu_data.x86_phys_bits;
+ else if (!g_phys_as)
g_phys_as = phys_as;
+
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0008_EBX);
@@ -867,10 +1148,39 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 0x8000001a:
case 0x8000001e:
break;
- /* Support memory encryption cpuid if host supports it */
case 0x8000001F:
- if (!boot_cpu_has(X86_FEATURE_SEV))
+ if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ } else {
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+
+ /*
+ * Enumerate '0' for "PA bits reduction", the adjusted
+ * MAXPHYADDR is enumerated directly (see 0x80000008).
+ */
+ entry->ebx &= ~GENMASK(11, 6);
+ }
+ break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->ecx = entry->edx = 0;
+ /*
+ * Pass down these bits:
+ * EAX 0 NNDBP, Processor ignores nested data breakpoints
+ * EAX 2 LAS, LFENCE always serializing
+ * EAX 6 NSCB, Null selector clear base
+ *
+ * Other defined bits are for MSRs that KVM does not expose:
+ * EAX 3 SPCL, SMM page configuration lock
+ * EAX 13 PCMSR, Prefetch control MSR
+ */
+ entry->eax &= BIT(0) | BIT(2) | BIT(6);
+ if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ entry->eax |= BIT(2);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ entry->eax |= BIT(6);
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
@@ -981,8 +1291,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
if (sanity_check_entries(entries, cpuid->nent, type))
return -EINVAL;
- array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
- cpuid->nent));
+ array.entries = kvcalloc(sizeof(struct kvm_cpuid_entry2), cpuid->nent, GFP_KERNEL);
if (!array.entries)
return -ENOMEM;
@@ -1000,7 +1309,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
r = -EFAULT;
out_free:
- vfree(array.entries);
+ kvfree(array.entries);
return r;
}
@@ -1033,7 +1342,7 @@ EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
* - Centaur: 0xc0000000 - 0xcfffffff
*
* The Hypervisor class is further subdivided into sub-classes that each act as
- * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu
+ * their own independent class associated with a 0x100 byte range. E.g. if Qemu
* is advertising support for both HyperV and KVM, the resulting Hypervisor
* CPUID sub-classes are:
*
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 2a0c5064497f..8a770b481d9d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -3,11 +3,12 @@
#define ARCH_X86_KVM_CPUID_H
#include "x86.h"
+#include "reverse_cpuid.h"
#include <asm/cpu.h>
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -29,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
@@ -58,144 +61,8 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
}
-struct cpuid_reg {
- u32 function;
- u32 index;
- int reg;
-};
-
-static const struct cpuid_reg reverse_cpuid[] = {
- [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
- [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
- [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
- [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
- [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
- [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
- [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
- [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
- [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
- [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
- [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
- [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
- [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
- [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
- [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
-};
-
-/*
- * Reverse CPUID and its derivatives can only be used for hardware-defined
- * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
- * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
- * is nonsensical as the bit number/mask is an arbitrary software-defined value
- * and can't be used by KVM to query/control guest capabilities. And obviously
- * the leaf being queried must have an entry in the lookup table.
- */
-static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
-{
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
- BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
- BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
-}
-
-/*
- * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
- * the hardware defined bit number (stored in bits 4:0) and a software defined
- * "word" (stored in bits 31:5). The word is used to index into arrays of
- * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
- */
-static __always_inline u32 __feature_bit(int x86_feature)
-{
- reverse_cpuid_check(x86_feature / 32);
- return 1 << (x86_feature & 31);
-}
-
-#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
-
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
-{
- unsigned int x86_leaf = x86_feature / 32;
-
- reverse_cpuid_check(x86_leaf);
- return reverse_cpuid[x86_leaf];
-}
-
-static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
- u32 reg)
-{
- switch (reg) {
- case CPUID_EAX:
- return &entry->eax;
- case CPUID_EBX:
- return &entry->ebx;
- case CPUID_ECX:
- return &entry->ecx;
- case CPUID_EDX:
- return &entry->edx;
- default:
- BUILD_BUG();
- return NULL;
- }
-}
-
-static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
-
- return __cpuid_entry_get_reg(entry, cpuid.reg);
-}
-
-static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- return *reg & __feature_bit(x86_feature);
-}
-
-static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- return cpuid_entry_get(entry, x86_feature);
-}
-
-static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- *reg &= ~__feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- *reg |= __feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature,
- bool set)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- /*
- * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
- * compiler into using CMOV instead of Jcc when possible.
- */
- if (set)
- *reg |= __feature_bit(x86_feature);
- else
- *reg &= ~__feature_bit(x86_feature);
-}
-
static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
- enum cpuid_leafs leaf)
+ unsigned int leaf)
{
u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
@@ -248,6 +115,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
}
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -308,7 +183,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +191,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +199,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 7e818d64bb4d..9240b3b7f8dd 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -7,6 +7,8 @@
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
static int vcpu_get_timer_advance_ns(void *data, u64 *val)
{
@@ -17,6 +19,15 @@ static int vcpu_get_timer_advance_ns(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+static int vcpu_get_guest_mode(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->stat.guest_mode;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n");
+
static int vcpu_get_tsc_offset(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
@@ -45,6 +56,8 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bi
void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
{
+ debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu,
+ &vcpu_guest_mode_fops);
debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
&vcpu_tsc_offset_fops);
@@ -62,3 +75,115 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
&vcpu_tsc_scaling_frac_fops);
}
}
+
+/*
+ * This covers statistics <1024 (11=log(1024)+1), which should be enough to
+ * cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define RMAP_LOG_SIZE 11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ int bkt;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots)
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = ffs(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ return single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+ return 0;
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f7970ba6219f..89b11e7dca8a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,9 +22,9 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include <linux/stringify.h>
-#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
#include "x86.h"
#include "tss.h"
@@ -176,6 +176,7 @@
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -189,11 +190,12 @@
#define X16(x...) X8(x), X8(x)
#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
-#define FASTOP_SIZE 8
+#define FASTOP_SIZE (8 * (1 + HAS_KERNEL_IBT))
struct opcode {
- u64 flags : 56;
- u64 intercept : 8;
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
const struct opcode *group;
@@ -310,13 +312,14 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
".type " name ", @function \n\t" \
- name ":\n\t"
+ name ":\n\t" \
+ ASM_ENDBR
#define FOP_FUNC(name) \
__FOP_FUNC(#name)
#define __FOP_RET(name) \
- "ret \n\t" \
+ "11: " ASM_RET \
".size " name ", .-" name "\n\t"
#define FOP_RET(name) \
@@ -345,7 +348,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_RET(#op "_" #dst)
#define FOP1EEX(op, dst) \
- FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
+ FOP1E(op, dst) _ASM_EXTABLE_TYPE_REG(10b, 11b, EX_TYPE_ZERO_REG, %%esi)
#define FASTOP1(op) \
FOP_START(op) \
@@ -428,18 +431,30 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
FOP_END
/* Special case for SETcc - 1 instruction per cc */
+
+/*
+ * Depending on .config the SETcc functions look like:
+ *
+ * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT]
+ * SETcc %al [3 bytes]
+ * RET [1 byte]
+ * INT3 [1 byte; CONFIG_SLS]
+ *
+ * Which gives possible sizes 4, 5, 8 or 9. When rounded up to the
+ * next power-of-two alignment they become 4, 8 or 16 resp.
+ */
+#define SETCC_LENGTH (ENDBR_INSN_SIZE + 4 + IS_ENABLED(CONFIG_SLS))
+#define SETCC_ALIGN (4 << IS_ENABLED(CONFIG_SLS) << HAS_KERNEL_IBT)
+static_assert(SETCC_LENGTH <= SETCC_ALIGN);
+
#define FOP_SETCC(op) \
- ".align 4 \n\t" \
+ ".align " __stringify(SETCC_ALIGN) " \n\t" \
".type " #op ", @function \n\t" \
#op ": \n\t" \
+ ASM_ENDBR \
#op " %al \n\t" \
__FOP_RET(#op)
-asm(".pushsection .fixup, \"ax\"\n"
- ".global kvm_fastop_exception \n"
- "kvm_fastop_exception: xor %esi, %esi; ret\n"
- ".popsection");
-
FOP_START(setcc)
FOP_SETCC(seto)
FOP_SETCC(setno)
@@ -475,12 +490,8 @@ FOP_END;
\
asm volatile("1:" insn "\n" \
"2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: movl $1, %[_fault]\n" \
- " jmp 2b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [_fault] "+qm"(_fault) inoutclob ); \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
\
_fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
})
@@ -673,7 +684,7 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
static inline bool emul_is_noncanonical_address(u64 la,
struct x86_emulate_ctxt *ctxt)
{
- return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+ return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt));
}
/*
@@ -723,7 +734,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
case X86EMUL_MODE_PROT64:
*linear = la;
va_bits = ctxt_virt_addr_bits(ctxt);
- if (get_canonical(la, va_bits) != la)
+ if (!__is_canonical_address(la, va_bits))
goto bad;
*max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
@@ -1055,7 +1066,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
- void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+ void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf);
flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
asm("push %[flags]; popf; " CALL_NOSPEC
@@ -1081,116 +1092,14 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void emulator_get_fpu(void)
-{
- fpregs_lock();
-
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-}
-
-static void emulator_put_fpu(void)
-{
- fpregs_unlock();
-}
-
-static void read_sse_reg(sse128_t *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
- case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
- case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
- case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
- case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
- case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
- case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
- case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
- case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
- case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
- case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
- case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
- case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
- case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
- case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
-#endif
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void write_sse_reg(sse128_t *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
- case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
- case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
- case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
- case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
- case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
- case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
- case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
- case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
- case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
- case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
- case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
- case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
- case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
- case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
-#endif
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void read_mmx_reg(u64 *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
- case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
- case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
- case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
- case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
- case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
- case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
- case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void write_mmx_reg(u64 *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
- case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
- case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
- case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
- case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
- case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
- case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
- case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
- default: BUG();
- }
- emulator_put_fpu();
-}
-
static int em_fninit(struct x86_emulate_ctxt *ctxt)
{
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fninit");
- emulator_put_fpu();
+ kvm_fpu_put();
return X86EMUL_CONTINUE;
}
@@ -1201,9 +1110,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fnstcw %0": "+m"(fcw));
- emulator_put_fpu();
+ kvm_fpu_put();
ctxt->dst.val = fcw;
@@ -1217,9 +1126,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fnstsw %0": "+m"(fsw));
- emulator_put_fpu();
+ kvm_fpu_put();
ctxt->dst.val = fsw;
@@ -1238,7 +1147,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
- read_sse_reg(&op->vec_val, reg);
+ kvm_read_sse_reg(reg, &op->vec_val);
return;
}
if (ctxt->d & Mmx) {
@@ -1289,7 +1198,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = ctxt->modrm_rm;
- read_sse_reg(&op->vec_val, ctxt->modrm_rm);
+ kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
return rc;
}
if (ctxt->d & Mmx) {
@@ -1718,11 +1627,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1738,14 +1642,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (!(seg_desc.type & 8))
goto exception;
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
goto exception;
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
}
/* in long-mode d/b must be clear if l is set */
if (seg_desc.d && seg_desc.l) {
@@ -1762,6 +1686,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
old_desc = seg_desc;
seg_desc.type |= 2; /* busy */
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
@@ -1786,6 +1714,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
@@ -1866,10 +1799,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->bytes * op->count);
break;
case OP_XMM:
- write_sse_reg(&op->vec_val, op->addr.xmm);
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
break;
case OP_MM:
- write_mmx_reg(&op->mm_val, op->addr.mm);
+ kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
break;
case OP_NONE:
/* no writeback */
@@ -2311,9 +2244,6 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- /* Outer-privilege level return is not implemented */
- if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
- return X86EMUL_UNHANDLEABLE;
rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
X86_TRANSFER_RET,
&new_desc);
@@ -2638,8 +2568,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
ctxt->ops->set_nmi_mask(ctxt, false);
- ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
- ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
+ ctxt->ops->exiting_smm(ctxt);
/*
* Get back to real mode, to prepare a safe state in which to load
@@ -2678,12 +2607,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
}
/*
- * Give pre_leave_smm() a chance to make ISA-specific changes to the
- * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
* state-save area.
*/
- if (ctxt->ops->pre_leave_smm(ctxt, buf))
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->ops->leave_smm(ctxt, buf))
+ goto emulate_shutdown;
#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
@@ -2692,19 +2621,26 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
#endif
ret = rsm_load_state_32(ctxt, buf);
- if (ret != X86EMUL_CONTINUE) {
- /* FIXME: should triple fault */
- return X86EMUL_UNHANDLEABLE;
- }
+ if (ret != X86EMUL_CONTINUE)
+ goto emulate_shutdown;
- ctxt->ops->post_leave_smm(ctxt);
+ /*
+ * Note, the ctxt->ops callbacks are responsible for handling side
+ * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
+ * runtime updates, etc... If that changes, e.g. this flow is moved
+ * out of the emulator to make it look more like enter_smm(), then
+ * those side effects need to be explicitly handled for both success
+ * and shutdown.
+ */
+ return X86EMUL_CONTINUE;
+emulate_shutdown:
+ ctxt->ops->triple_fault(ctxt);
return X86EMUL_CONTINUE;
}
static void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct desc_struct *cs, struct desc_struct *ss)
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
{
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
@@ -2793,7 +2729,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2861,7 +2797,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2898,7 +2834,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
if ((ctxt->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -3116,8 +3052,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_16 tss_seg;
@@ -3222,7 +3157,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
}
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
@@ -3255,8 +3190,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return ret;
}
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_32 tss_seg;
@@ -3364,10 +3298,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -3607,8 +3540,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
{
u64 tsc_aux = 0;
- if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -3709,7 +3644,7 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);
+ r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
if (r == X86EMUL_IO_NEEDED)
return r;
@@ -3726,7 +3661,7 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
int r;
- r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);
+ r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
if (r == X86EMUL_IO_NEEDED)
return r;
@@ -4124,11 +4059,11 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- emulator_get_fpu();
+ kvm_fpu_get();
rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
- emulator_put_fpu();
+ kvm_fpu_put();
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -4172,7 +4107,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- emulator_get_fpu();
+ kvm_fpu_get();
if (size < __fxstate_size(16)) {
rc = fxregs_fixup(&fx_state, size);
@@ -4189,7 +4124,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
out:
- emulator_put_fpu();
+ kvm_fpu_put();
return rc;
}
@@ -4220,7 +4155,7 @@ static bool valid_cr(int nr)
}
}
-static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
{
if (!valid_cr(ctxt->modrm_reg))
return emulate_ud(ctxt);
@@ -4228,80 +4163,6 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int check_cr_write(struct x86_emulate_ctxt *ctxt)
-{
- u64 new_val = ctxt->src.val64;
- int cr = ctxt->modrm_reg;
- u64 efer = 0;
-
- static u64 cr_reserved_bits[] = {
- 0xffffffff00000000ULL,
- 0, 0, 0, /* CR3 checked later */
- CR4_RESERVED_BITS,
- 0, 0, 0,
- CR8_RESERVED_BITS,
- };
-
- if (!valid_cr(cr))
- return emulate_ud(ctxt);
-
- if (new_val & cr_reserved_bits[cr])
- return emulate_gp(ctxt, 0);
-
- switch (cr) {
- case 0: {
- u64 cr4;
- if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
- ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
- return emulate_gp(ctxt, 0);
-
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
- !(cr4 & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 3: {
- u64 rsvd = 0;
-
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA) {
- u64 maxphyaddr;
- u32 eax, ebx, ecx, edx;
-
- eax = 0x80000008;
- ecx = 0;
- if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
- &edx, true))
- maxphyaddr = eax & 0xff;
- else
- maxphyaddr = 36;
- rsvd = rsvd_bits(maxphyaddr, 63);
- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
- rsvd &= ~X86_CR3_PCID_NOFLUSH;
- }
-
- if (new_val & rsvd)
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 4: {
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- }
-
- return X86EMUL_CONTINUE;
-}
-
static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
{
unsigned long dr7;
@@ -4376,7 +4237,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
- return emulate_ud(ctxt);
+ return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
}
@@ -4393,6 +4254,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
return X86EMUL_CONTINUE;
+ /*
+ * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
@@ -4530,10 +4396,10 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
- I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps, em_call_far),
- I(SrcMem | NearBranch, em_jmp_abs),
- I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | NearBranch | IsBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
};
@@ -4576,7 +4442,7 @@ static const struct opcode group8[] = {
* from the register case of group9.
*/
static const struct gprefix pfx_0f_c7_7 = {
- N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
};
@@ -4743,7 +4609,7 @@ static const struct opcode opcode_table[256] = {
I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
- X16(D(SrcImmByte | NearBranch)),
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
/* 0x80 - 0x87 */
G(ByteOp | DstMem | SrcImm, group1),
G(DstMem | SrcImm, group1),
@@ -4762,7 +4628,7 @@ static const struct opcode opcode_table[256] = {
DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
- I(SrcImmFAddr | No64, em_call_far), N,
+ I(SrcImmFAddr | No64 | IsBranch, em_call_far), N,
II(ImplicitOps | Stack, em_pushf, pushf),
II(ImplicitOps | Stack, em_popf, popf),
I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
@@ -4782,17 +4648,19 @@ static const struct opcode opcode_table[256] = {
X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
- I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | NearBranch, em_ret),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- I(ImplicitOps | SrcImmU16, em_ret_far_imm),
- I(ImplicitOps, em_ret_far),
- D(ImplicitOps), DI(SrcImmByte, intn),
- D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
+ I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter),
+ I(Stack | IsBranch, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm),
+ I(ImplicitOps | IsBranch, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch, em_iret, iret),
/* 0xD0 - 0xD7 */
G(Src2One | ByteOp, group2), G(Src2One, group2),
G(Src2CL | ByteOp, group2), G(Src2CL, group2),
@@ -4803,14 +4671,15 @@ static const struct opcode opcode_table[256] = {
/* 0xD8 - 0xDF */
N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
- X3(I(SrcImmByte | NearBranch, em_loop)),
- I(SrcImmByte | NearBranch, em_jcxz),
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
- I(SrcImmFAddr | No64, em_jmp_far),
- D(SrcImmByte | ImplicitOps | NearBranch),
+ I(SrcImm | NearBranch | IsBranch, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
@@ -4826,7 +4695,7 @@ static const struct opcode opcode_table[256] = {
static const struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
G(0, group6), GD(0, &group7), N, N,
- N, I(ImplicitOps | EmulateOnUD, em_syscall),
+ N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall),
II(ImplicitOps | Priv, em_clts, clts), N,
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4841,10 +4710,10 @@ static const struct opcode twobyte_table[256] = {
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
/* 0x20 - 0x2F */
- DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
- check_cr_write),
+ check_cr_access),
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
check_dr_write),
N, N, N, N,
@@ -4857,8 +4726,8 @@ static const struct opcode twobyte_table[256] = {
IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
II(ImplicitOps | Priv, em_rdmsr, rdmsr),
IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
- I(ImplicitOps | EmulateOnUD, em_sysenter),
- I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
+ I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit),
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
@@ -4876,7 +4745,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
- X16(D(SrcImm | NearBranch)),
+ X16(D(SrcImm | NearBranch | IsBranch)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
@@ -5185,7 +5054,7 @@ done:
return rc;
}
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
@@ -5390,13 +5259,16 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
+ ctxt->is_branch = opcode.flags & IsBranch;
+
/* Unrecognised? */
if (ctxt->d == 0)
return EMULATION_FAILED;
ctxt->execute = opcode.u.execute;
- if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
return EMULATION_FAILED;
if (unlikely(ctxt->d &
@@ -5510,9 +5382,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
int rc;
- emulator_get_fpu();
+ kvm_fpu_get();
rc = asm_safe("fwait");
- emulator_put_fpu();
+ kvm_fpu_put();
if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
@@ -5523,7 +5395,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
static void fetch_possible_mmx_operand(struct operand *op)
{
if (op->type == OP_MM)
- read_mmx_reg(&op->mm_val, op->addr.mm);
+ kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
}
static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
@@ -5546,8 +5418,13 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
- memset(&ctxt->rip_relative, 0,
- (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
ctxt->io_read.pos = 0;
ctxt->io_read.end = 0;
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
new file mode 100644
index 000000000000..3ba12888bf66
--- /dev/null
+++ b/arch/x86/kvm/fpu.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_FPU_H_
+#define __KVM_FPU_H_
+
+#include <asm/fpu/api.h>
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; })
+#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; })
+#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; })
+#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
+#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+
+static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_mmx_reg(int reg, u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void kvm_fpu_get(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void kvm_fpu_put(void)
+{
+ fpregs_unlock();
+}
+
+static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_mmx_reg(int reg, u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_read_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_write_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+#endif
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f98370a39936..e2e95a6fccfd 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -36,6 +36,7 @@
#include "trace.h"
#include "irq.h"
+#include "fpu.h"
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -87,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
int vector)
{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ bool auto_eoi_old, auto_eoi_new;
+
if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
return;
@@ -95,10 +100,37 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
else
__clear_bit(vector, synic->vec_bitmap);
+ auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
if (synic_has_vector_auto_eoi(synic, vector))
__set_bit(vector, synic->auto_eoi_bitmap);
else
__clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (auto_eoi_old == auto_eoi_new)
+ return;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ /*
+ * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on
+ * the hypervisor to manually inject IRQs.
+ */
+ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm,
+ APICV_INHIBIT_REASON_HYPERV,
+ !!hv->synic_auto_eoi_used);
+
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
}
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -139,7 +171,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu = NULL;
- int i;
+ unsigned long i;
if (vpidx >= KVM_MAX_VCPUS)
return NULL;
@@ -211,7 +243,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int ret;
- if (!synic->active && !host)
+ if (!synic->active && (!host || data))
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -257,6 +289,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
case HV_X64_MSR_EOM: {
int i;
+ if (!synic->active)
+ break;
+
for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
kvm_hv_notify_acked_sint(vcpu, i);
break;
@@ -273,15 +308,10 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *entry;
-
- entry = kvm_find_cpuid_entry(vcpu,
- HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES,
- 0);
- if (!entry)
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+ return hv_vcpu->cpuid_cache.syndbg_cap_eax &
+ HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
}
static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
@@ -426,6 +456,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
struct kvm_lapic_irq irq;
int ret, vector;
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
if (sint >= ARRAY_SIZE(synic->sint))
return -EINVAL;
@@ -635,9 +668,15 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || config))
+ return 1;
+
+ if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_STIMER_DIRECT_MODE_AVAILABLE)))
return 1;
trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
@@ -661,7 +700,7 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || count))
return 1;
trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
@@ -913,7 +952,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_init(&hv_vcpu->stimer[i], i);
- hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+ hv_vcpu->vp_index = vcpu->vcpu_idx;
return 0;
}
@@ -931,12 +970,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
synic = to_hv_synic(vcpu);
- /*
- * Hyper-V SynIC auto EOI SINT's are
- * not compatible with APICV, so request
- * to deactivate APICV permanently.
- */
- kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
synic->control = HV_SYNIC_CONTROL_ENABLE;
@@ -1102,11 +1135,13 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+ mutex_lock(&hv->hv_lock);
+
if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
- return;
+ goto out_unlock;
- mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1168,34 +1203,91 @@ out_unlock:
mutex_unlock(&hv->hv_lock);
}
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
- u64 gfn;
-
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
- hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
- tsc_page_update_unsafe(hv))
- return;
mutex_lock(&hv->hv_lock);
- if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- goto out_unlock;
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
- /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
- hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+ mutex_unlock(&hv->hv_lock);
+}
- gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
- hv->tsc_ref.tsc_sequence = 0;
- if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
- &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_HYPERCALL_AVAILABLE;
+ case HV_X64_MSR_VP_RUNTIME:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_RUNTIME_AVAILABLE;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ case HV_X64_MSR_VP_INDEX:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_INDEX_AVAILABLE;
+ case HV_X64_MSR_RESET:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_RESET_AVAILABLE;
+ case HV_X64_MSR_REFERENCE_TSC:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_REFERENCE_TSC_AVAILABLE;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNIC_AVAILABLE;
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG:
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNTIMER_AVAILABLE;
+ case HV_X64_MSR_EOI:
+ case HV_X64_MSR_ICR:
+ case HV_X64_MSR_TPR:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_APIC_ACCESS_AVAILABLE;
+ break;
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_FREQUENCY_MSRS;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ default:
+ break;
+ }
-out_unlock:
- mutex_unlock(&hv->hv_lock);
+ return false;
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
@@ -1204,6 +1296,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
struct kvm *kvm = vcpu->kvm;
struct kvm_hv *hv = to_kvm_hv(kvm);
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
hv->hv_guest_os_id = data;
@@ -1332,10 +1427,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
- int vcpu_idx = kvm_vcpu_get_idx(vcpu);
u32 new_vp_index = (u32)data;
if (!host || new_vp_index >= KVM_MAX_VCPUS)
@@ -1350,9 +1447,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
* VP index is changing, adjust num_mismatched_vp_indexes if
* it now matches or no longer matches vcpu_idx.
*/
- if (hv_vcpu->vp_index == vcpu_idx)
+ if (hv_vcpu->vp_index == vcpu->vcpu_idx)
atomic_inc(&hv->num_mismatched_vp_indexes);
- else if (new_vp_index == vcpu_idx)
+ else if (new_vp_index == vcpu->vcpu_idx)
atomic_dec(&hv->num_mismatched_vp_indexes);
hv_vcpu->vp_index = new_vp_index;
@@ -1364,7 +1461,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
hv_vcpu->hv_vapic = data;
- if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
+ if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
return 1;
break;
}
@@ -1382,7 +1479,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
hv_vcpu->hv_vapic = data;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- if (kvm_lapic_enable_pv_eoi(vcpu,
+ if (kvm_lapic_set_pv_eoi(vcpu,
gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
sizeof(struct hv_vp_assist_page)))
return 1;
@@ -1446,6 +1543,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
struct kvm *kvm = vcpu->kvm;
struct kvm_hv *hv = to_kvm_hv(kvm);
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
data = hv->hv_guest_os_id;
@@ -1495,6 +1595,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
u64 data = 0;
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_VP_INDEX:
data = hv_vcpu->vp_index;
@@ -1596,50 +1699,125 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline unsigned long *sparse_set_to_vcpu_mask(
- struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
- u64 *vp_bitmap, unsigned long *vcpu_bitmap)
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
struct kvm_vcpu *vcpu;
- int i, bank, sbank = 0;
+ int bank, sbank = 0;
+ unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
- memset(vp_bitmap, 0,
- KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
+
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
- vp_bitmap[bank] = sparse_banks[sbank++];
+ bitmap[bank] = sparse_banks[sbank++];
- if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
- /* for all vcpus vp_index == vcpu_idx */
- return (unsigned long *)vp_bitmap;
- }
+ if (likely(!has_mismatch))
+ return;
- bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
- __set_bit(i, vcpu_bitmap);
+ __set_bit(i, vcpu_mask);
}
- return vcpu_bitmap;
}
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
+struct kvm_hv_hcall {
+ u64 param;
+ u64 ingpa;
+ u64 outgpa;
+ u16 code;
+ u16 var_cnt;
+ u16 rep_cnt;
+ u16 rep_idx;
+ bool fast;
+ bool rep;
+ sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+};
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ int consumed_xmm_halves,
+ u64 *sparse_banks, gpa_t offset)
+{
+ u16 var_cnt;
+ int i;
+
+ if (hc->var_cnt > 64)
+ return -EINVAL;
+
+ /* Ignore banks that cannot possibly contain a legal VP index. */
+ var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
+ */
+ if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ for (i = 0; i < var_cnt; i++) {
+ int j = i + consumed_xmm_halves;
+ if (j % 2)
+ sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
+ var_cnt * sizeof(*sparse_banks));
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
bool all_cpus;
- if (!ex) {
- if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ /*
+ * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
+ * valid mask is a u64. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) would exceed this limit, KVM will additional changes
+ * for Hyper-V support to avoid setting the guest up to fail.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
+ if (hc->fast) {
+ flush.address_space = hc->ingpa;
+ flush.flags = hc->outgpa;
+ flush.processor_mask = sse128_lo(hc->xmm[0]);
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa,
+ &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
trace_kvm_hv_flush_tlb(flush.processor_mask,
flush.address_space, flush.flags);
@@ -1657,9 +1835,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
flush.processor_mask == 0;
} else {
- if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
- sizeof(flush_ex))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (hc->fast) {
+ flush_ex.address_space = hc->ingpa;
+ flush_ex.flags = hc->outgpa;
+ memcpy(&flush_ex.hv_vp_set,
+ &hc->xmm[0], sizeof(hc->xmm[0]));
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
flush_ex.hv_vp_set.format,
@@ -1670,39 +1855,38 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len =
- bitmap_weight((unsigned long *)&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto do_flush;
- if (!sparse_banks_len && !all_cpus)
+ if (!hc->var_cnt)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
- ingpa + offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
+ if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
+ offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
- cpumask_clear(&hv_vcpu->tlb_flush);
-
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
-
+do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
- NULL, vcpu_mask, &hv_vcpu->tlb_flush);
+ if (all_cpus) {
+ kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
+ } else {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
+ }
ret_success:
- /* We always do full TLB flush, set rep_done = rep_cnt. */
+ /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
return (u64)HV_STATUS_SUCCESS |
- ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+ ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
@@ -1713,7 +1897,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
.vector = vector
};
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
@@ -1724,43 +1908,45 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
- bool ex, bool fast)
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
- unsigned long *vcpu_mask;
- unsigned long valid_bank_mask;
- u64 sparse_banks[64];
- int sparse_banks_len;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+ u64 valid_bank_mask;
+ u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
- if (!ex) {
- if (!fast) {
- if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+ if (hc->code == HVCALL_SEND_IPI) {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
sizeof(send_ipi))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
sparse_banks[0] = send_ipi.cpu_mask;
vector = send_ipi.vector;
} else {
/* 'reserved' part of hv_send_ipi should be 0 */
- if (unlikely(ingpa >> 32 != 0))
+ if (unlikely(hc->ingpa >> 32 != 0))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- sparse_banks[0] = outgpa;
- vector = (u32)ingpa;
+ sparse_banks[0] = hc->outgpa;
+ vector = (u32)hc->ingpa;
}
all_cpus = false;
valid_bank_mask = BIT_ULL(0);
trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
} else {
- if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
- sizeof(send_ipi_ex))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
send_ipi_ex.vp_set.format,
@@ -1768,31 +1954,34 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
vector = send_ipi_ex.vector;
valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
- sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
-
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
- if (!sparse_banks_len)
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto check_and_send_ipi;
+
+ if (!hc->var_cnt)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
- ingpa + offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
+ if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
+ offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
+ if (all_cpus) {
+ kvm_send_ipi_to_many(kvm, vector, NULL);
+ } else {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
+ }
ret_success:
return HV_STATUS_SUCCESS;
@@ -1801,24 +1990,74 @@ ret_success:
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *entry;
+ struct kvm_vcpu_hv *hv_vcpu;
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
- if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+ if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
vcpu->arch.hyperv_enabled = true;
- else
+ } else {
vcpu->arch.hyperv_enabled = false;
+ return;
+ }
+
+ if (!to_hv_vcpu(vcpu) && kvm_hv_vcpu_init(vcpu))
+ return;
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0);
+ if (entry) {
+ hv_vcpu->cpuid_cache.features_eax = entry->eax;
+ hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
+ hv_vcpu->cpuid_cache.features_edx = entry->edx;
+ } else {
+ hv_vcpu->cpuid_cache.features_eax = 0;
+ hv_vcpu->cpuid_cache.features_ebx = 0;
+ hv_vcpu->cpuid_cache.features_edx = 0;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0);
+ if (entry) {
+ hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
+ } else {
+ hv_vcpu->cpuid_cache.enlightenments_eax = 0;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = 0;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0);
+ if (entry)
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
+ else
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = 0;
}
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
{
- return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+ struct kvm_vcpu_hv *hv_vcpu;
+ int ret = 0;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (enforce) {
+ ret = kvm_hv_vcpu_init(vcpu);
+ if (ret)
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+ hv_vcpu->enforce_cpuid = enforce;
+
+ return ret;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (longmode)
kvm_rax_write(vcpu, result);
else {
@@ -1829,6 +2068,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
+ trace_kvm_hv_hypercall_done(result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
@@ -1839,20 +2079,21 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
}
-static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
struct eventfd_ctx *eventfd;
- if (unlikely(!fast)) {
+ if (unlikely(!hc->fast)) {
int ret;
- gpa_t gpa = param;
+ gpa_t gpa = hc->ingpa;
- if ((gpa & (__alignof__(param) - 1)) ||
- offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
+ offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
return HV_STATUS_INVALID_ALIGNMENT;
- ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
+ ret = kvm_vcpu_read_guest(vcpu, gpa,
+ &hc->ingpa, sizeof(hc->ingpa));
if (ret < 0)
return HV_STATUS_INVALID_ALIGNMENT;
}
@@ -1862,15 +2103,15 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
* have no use for it, and in all known usecases it is zero, so just
* report lookup failure if it isn't.
*/
- if (param & 0xffff00000000ULL)
+ if (hc->ingpa & 0xffff00000000ULL)
return HV_STATUS_INVALID_PORT_ID;
/* remaining bits are reserved-zero */
- if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
rcu_read_lock();
- eventfd = idr_find(&hv->conn_to_evt, param);
+ eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1879,11 +2120,82 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
return HV_STATUS_SUCCESS;
}
+static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
+{
+ switch (hc->code) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
+ return true;
+ }
+
+ return false;
+}
+
+static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
+{
+ int reg;
+
+ kvm_fpu_get();
+ for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
+ _kvm_read_sse_reg(reg, &hc->xmm[reg]);
+ kvm_fpu_put();
+}
+
+static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ return hv_vcpu->cpuid_cache.enlightenments_ebx &&
+ hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
+ case HVCALL_POST_MESSAGE:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
+ case HVCALL_SIGNAL_EVENT:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ case HVCALL_RESET_DEBUG_SESSION:
+ /*
+ * Return 'true' when SynDBG is disabled so the resulting code
+ * will be HV_STATUS_INVALID_HYPERCALL_CODE.
+ */
+ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
+ hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ case HVCALL_SEND_IPI_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_SEND_IPI:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_CLUSTER_IPI_RECOMMENDED;
+ default:
+ break;
+ }
+
+ return true;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
- u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
- uint16_t code, rep_idx, rep_cnt;
- bool fast, rep;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_hv_hcall hc;
+ u64 ret = HV_STATUS_SUCCESS;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -1895,105 +2207,125 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_X86_64
- if (is_64_bit_mode(vcpu)) {
- param = kvm_rcx_read(vcpu);
- ingpa = kvm_rdx_read(vcpu);
- outgpa = kvm_r8_read(vcpu);
+ if (is_64_bit_hypercall(vcpu)) {
+ hc.param = kvm_rcx_read(vcpu);
+ hc.ingpa = kvm_rdx_read(vcpu);
+ hc.outgpa = kvm_r8_read(vcpu);
} else
#endif
{
- param = ((u64)kvm_rdx_read(vcpu) << 32) |
- (kvm_rax_read(vcpu) & 0xffffffff);
- ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
- (kvm_rcx_read(vcpu) & 0xffffffff);
- outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
- (kvm_rsi_read(vcpu) & 0xffffffff);
+ hc.param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
}
- code = param & 0xffff;
- fast = !!(param & HV_HYPERCALL_FAST_BIT);
- rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
- rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
- rep = !!(rep_cnt || rep_idx);
+ hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
+ hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
+ hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
- switch (code) {
+ if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
+ ret = HV_STATUS_ACCESS_DENIED;
+ goto hypercall_complete;
+ }
+
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
+ if (hc.fast && is_xmm_fast_hypercall(&hc)) {
+ if (unlikely(hv_vcpu->enforce_cpuid &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_hv_hypercall_read_xmm(&hc);
+ }
+
+ switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- if (unlikely(rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
- if (unlikely(rep)) {
+ if (unlikely(hc.rep || hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ ret = kvm_hvcall_signal_event(vcpu, &hc);
if (ret != HV_STATUS_INVALID_PORT_ID)
break;
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = param;
- vcpu->run->hyperv.u.hcall.params[0] = ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
vcpu->arch.complete_userspace_io =
kvm_hv_hypercall_complete_userspace;
return 0;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
- if (unlikely(fast || !rep_cnt || rep_idx)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
- break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
- if (unlikely(fast || rep)) {
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
- case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
- if (unlikely(fast || !rep_cnt || rep_idx)) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
- break;
+ fallthrough;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
- if (unlikely(fast || rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
break;
case HVCALL_SEND_IPI:
- if (unlikely(rep)) {
+ if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
- break;
+ fallthrough;
case HVCALL_SEND_IPI_EX:
- if (unlikely(fast || rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+ ret = kvm_hv_send_ipi(vcpu, &hc);
break;
case HVCALL_POST_DEBUG_DATA:
case HVCALL_RETRIEVE_DEBUG_DATA:
- if (unlikely(fast)) {
+ if (unlikely(hc.fast)) {
ret = HV_STATUS_INVALID_PARAMETER;
break;
}
@@ -2012,9 +2344,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = param;
- vcpu->run->hyperv.u.hcall.params[0] = ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
vcpu->arch.complete_userspace_io =
kvm_hv_hypercall_complete_userspace;
return 0;
@@ -2024,6 +2356,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
break;
}
+hypercall_complete:
return kvm_hv_hypercall_complete(vcpu, ret);
}
@@ -2119,10 +2452,6 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
if (kvm_x86_ops.nested_ops->get_evmcs_version)
evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
- /* Skip NESTED_FEATURES if eVMCS is not supported */
- if (!evmcs_ver)
- --nent;
-
if (cpuid->nent < nent)
return -E2BIG;
@@ -2172,6 +2501,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->ebx |= HV_POST_MESSAGES;
ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
@@ -2198,6 +2528,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
if (!cpu_smt_possible())
ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
/*
* Default number of spinlock retry attempts, matches
* HyperV 2016.
@@ -2219,6 +2551,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60547d5cb6d7..da2737f2a956 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -83,13 +83,17 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+ return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
}
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
@@ -133,11 +137,12 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a6e218c6140d..1c83076091af 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
return;
timer = &pit->pit_state.timer;
@@ -241,7 +242,7 @@ static void pit_do_work(struct kthread_work *work)
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
struct kvm *kvm = pit->kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
struct kvm_kpit_state *ps = &pit->pit_state;
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
@@ -304,15 +305,13 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
* So, deactivate APICv when PIT is in reinject mode.
*/
if (reinject) {
- kvm_request_apicv_update(kvm, false,
- APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
/* The initial state is preserved while ps->reinject == 0. */
kvm_pit_reset_reinject(pit);
kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
} else {
- kvm_request_apicv_update(kvm, true,
- APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 629a09ca9860..e1bb6218bb96 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s)
{
bool wakeup = s->wakeup_needed;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
s->wakeup_needed = false;
@@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
@@ -270,7 +269,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
static void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, i;
+ int irq;
+ unsigned long i;
struct kvm_vcpu *vcpu;
u8 edge_irr = s->irr & ~s->elcr;
bool found = false;
@@ -436,13 +436,13 @@ static u32 pic_ioport_read(void *opaque, u32 addr)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
@@ -473,7 +473,7 @@ static int picdev_write(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ elcr_ioport_write(&s->pics[addr & 1], data);
pic_unlock(s);
break;
default:
@@ -504,7 +504,7 @@ static int picdev_read(struct kvm_pic *s,
case 0x4d0:
case 0x4d1:
pic_lock(s);
- *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
pic_unlock(s);
break;
default:
@@ -541,17 +541,17 @@ static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
addr, len, val);
}
-static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
- return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+ return picdev_write(container_of(dev, struct kvm_pic, dev_elcr),
addr, len, val);
}
-static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
- return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+ return picdev_read(container_of(dev, struct kvm_pic, dev_elcr),
addr, len, val);
}
@@ -577,9 +577,9 @@ static const struct kvm_io_device_ops picdev_slave_ops = {
.write = picdev_slave_write,
};
-static const struct kvm_io_device_ops picdev_eclr_ops = {
- .read = picdev_eclr_read,
- .write = picdev_eclr_write,
+static const struct kvm_io_device_ops picdev_elcr_ops = {
+ .read = picdev_elcr_read,
+ .write = picdev_elcr_write,
};
int kvm_pic_init(struct kvm *kvm)
@@ -602,7 +602,7 @@ int kvm_pic_init(struct kvm *kvm)
*/
kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
- kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
+ kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops);
mutex_lock(&kvm->slots_lock);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
&s->dev_master);
@@ -613,7 +613,7 @@ int kvm_pic_init(struct kvm *kvm)
if (ret < 0)
goto fail_unreg_2;
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr);
if (ret < 0)
goto fail_unreg_1;
@@ -647,7 +647,7 @@ void kvm_pic_destroy(struct kvm *kvm)
mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
- kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr);
mutex_unlock(&kvm->slots_lock);
kvm->arch.vpic = NULL;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 698969e18fe3..765943d7cfa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -54,9 +54,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
int trigger_mode,
int pin);
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
- unsigned long addr,
- unsigned long length)
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
{
unsigned long result = 0;
@@ -96,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -149,7 +147,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
if (RTC_GSI >= IOAPIC_NUM_PINS)
return;
@@ -184,7 +182,7 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
@@ -319,8 +317,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
unsigned index;
bool mask_before, mask_after;
union kvm_ioapic_redirect_entry *e;
- unsigned long vcpu_bitmap;
int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
switch (ioapic->ioregsel) {
case IOAPIC_REG_VERSION:
@@ -384,9 +382,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
irq.shorthand = APIC_DEST_NOSHORT;
irq.dest_id = e->fields.dest_id;
irq.msi_redir_hint = false;
- bitmap_zero(&vcpu_bitmap, 16);
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
- &vcpu_bitmap);
+ vcpu_bitmap);
if (old_dest_mode != e->fields.dest_mode ||
old_dest_id != e->fields.dest_id) {
/*
@@ -399,10 +397,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
kvm_lapic_irq_dest_mode(
!!e->fields.dest_mode);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
- &vcpu_bitmap);
+ vcpu_bitmap);
}
kvm_make_scan_ioapic_request_mask(ioapic->kvm,
- &vcpu_bitmap);
+ vcpu_bitmap);
} else {
kvm_make_scan_ioapic_request(ioapic->kvm);
}
@@ -593,7 +591,7 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
break;
case IOAPIC_REG_WINDOW:
- result = ioapic_read_indirect(ioapic, addr, len);
+ result = ioapic_read_indirect(ioapic);
break;
default:
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 660401700075..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -35,21 +35,17 @@ struct kvm_vcpu;
#define IOAPIC_INIT 0x5
#define IOAPIC_EXTINT 0x7
-#ifdef CONFIG_X86
#define RTC_GSI 8
-#else
-#define RTC_GSI -1U
-#endif
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID];
+ u8 vectors[KVM_MAX_VCPU_IDS];
};
@@ -85,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 172b05343cfd..f371f1292ca3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -22,10 +22,14 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
+ int r = 0;
+
if (lapic_in_kernel(vcpu))
- return apic_has_pending_timer(vcpu);
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return 0;
+ return r;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9b64abf9b3f1..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -55,8 +55,7 @@ struct kvm_pic {
int output; /* intr from master PIC */
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
- struct kvm_io_device dev_eclr;
- void (*ack_notifier)(void *opaque, int irq);
+ struct kvm_io_device dev_elcr;
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8a4de3f12820..0687162c4f22 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
#include "hyperv.h"
#include "x86.h"
+#include "xen.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -45,9 +46,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
- int i, r = -1;
+ int r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
return r;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
default:
break;
}
@@ -269,7 +277,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue)
{
/* We can't check irqchip_in_kernel() here as some callers are
- * currently inititalizing the irqchip. Other callers should therefore
+ * currently initializing the irqchip. Other callers should therefore
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
default:
return -EINVAL;
}
@@ -320,7 +332,8 @@ int kvm_set_routing_entry(struct kvm *kvm,
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
- int i, r = 0;
+ int r = 0;
+ unsigned long i;
struct kvm_vcpu *vcpu;
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 2e11da2f5621..3febc342360c 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,12 @@
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
{ \
@@ -37,6 +43,13 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -62,7 +75,12 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode. In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
{
if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
return 0;
@@ -73,8 +91,8 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
- unsigned long val)
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+ unsigned long val)
{
if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
return;
@@ -85,22 +103,22 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RIP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
}
static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RSP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
}
static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RSP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
}
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
@@ -113,6 +131,11 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.walk_mmu->pdptrs[index];
}
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+ vcpu->arch.walk_mmu->pdptrs[index] = value;
+}
+
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
@@ -157,6 +180,7 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
+ vcpu->stat.guest_mode = 1;
}
static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
@@ -167,6 +191,8 @@ static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
vcpu->arch.load_eoi_exitmap_pending = false;
kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
}
+
+ vcpu->stat.guest_mode = 0;
}
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 0d359115429a..8dff25d267b7 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -13,6 +13,7 @@
#define _ASM_X86_KVM_X86_EMULATE_H
#include <asm/desc_defs.h>
+#include "fpu.h"
struct x86_emulate_ctxt;
enum x86_intercept;
@@ -209,6 +210,8 @@ struct x86_emulate_ops {
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+ int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -225,19 +228,17 @@ struct x86_emulate_ops {
bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
- void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
- int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt,
- const char *smstate);
- void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt);
+ void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
+ void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
};
-typedef u32 __attribute__((vector_size(16))) sse128_t;
-
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
@@ -314,7 +315,6 @@ struct x86_emulate_ctxt {
int interruptibility;
bool perm_ok; /* do not check permissions if true */
- bool ud; /* inject an #UD if host doesn't support insn */
bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
@@ -339,11 +339,7 @@ struct x86_emulate_ctxt {
fastop_t fop;
};
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
- /*
- * The following six fields are cleared together,
- * the rest are initialized unconditionally in x86_decode_insn
- * or elsewhere
- */
+
bool rip_relative;
u8 rex_prefix;
u8 lock_prefix;
@@ -372,6 +368,7 @@ struct x86_emulate_ctxt {
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
+ bool is_branch;
};
/* Repeat String Operation Prefix */
@@ -468,6 +465,7 @@ enum x86_intercept {
x86_intercept_clgi,
x86_intercept_skinit,
x86_intercept_rdtscp,
+ x86_intercept_rdpid,
x86_intercept_icebp,
x86_intercept_wbinvd,
x86_intercept_monitor,
@@ -490,7 +488,7 @@ enum x86_intercept {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
new file mode 100644
index 000000000000..ee4f696a0782
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+ void *data)
+{
+ struct kvm_tlb_range *range = data;
+
+ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
+ range->pages);
+}
+
+static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
+ struct kvm_tlb_range *range)
+{
+ if (range)
+ return hyperv_flush_guest_mapping_range(root_tdp,
+ kvm_fill_hv_flush_list_func, (void *)range);
+ else
+ return hyperv_flush_guest_mapping(root_tdp);
+}
+
+int hv_remote_flush_tlb_with_range(struct kvm *kvm,
+ struct kvm_tlb_range *range)
+{
+ struct kvm_arch *kvm_arch = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
+ hpa_t root;
+
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+
+ if (!VALID_PAGE(kvm_arch->hv_root_tdp)) {
+ nr_unique_valid_roots = 0;
+
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ root = vcpu->arch.hv_root_tdp;
+ if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_arch->hv_root_tdp = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_tdp(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
+ }
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ } else {
+ ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range);
+ }
+
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
+
+int hv_remote_flush_tlb(struct kvm *kvm)
+{
+ return hv_remote_flush_tlb_with_range(kvm, NULL);
+}
+EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
new file mode 100644
index 000000000000..287e98ef9df3
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+int hv_remote_flush_tlb_with_range(struct kvm *kvm,
+ struct kvm_tlb_range *range);
+int hv_remote_flush_tlb(struct kvm *kvm);
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
+#else /* !CONFIG_HYPERV */
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+}
+#endif /* !CONFIG_HYPERV */
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cc369b9ad8f1..0e68b4c937fc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,39 @@ static bool lapic_timer_advance_dynamic __read_mostly;
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+ *((u32 *) (regs + reg_off)) = val;
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ __kvm_lapic_set_reg(apic->regs, reg_off, val);
+}
+
+static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
+{
+ return __kvm_lapic_get_reg64(apic->regs, reg);
+}
+
+static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
+{
+ __kvm_lapic_set_reg64(apic->regs, reg, val);
+}
+
static inline int apic_test_vector(int vec, void *bitmap)
{
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -113,7 +146,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
@@ -185,13 +219,16 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
return;
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
mutex_lock(&kvm->arch.apic_map_lock);
/*
* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
@@ -296,6 +333,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled)
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -484,8 +525,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -515,7 +555,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -563,8 +603,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -666,41 +705,40 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
-static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
-{
- u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0) {
- printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return false;
- }
- return val & KVM_PV_EOI_ENABLED;
-}
-
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
- printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
return;
- }
+
__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
- printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return;
- }
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
}
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
@@ -986,6 +1024,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = -1;
if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
return true;
}
@@ -1090,11 +1132,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
- kvm_lapic_set_irr(vector, apic);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_vcpu_kick(vcpu);
- }
+ static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -1165,8 +1204,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_lapic *src = NULL;
struct kvm_apic_map *map;
struct kvm_vcpu *vcpu;
- unsigned long bitmap;
- int i, vcpu_idx;
+ unsigned long bitmap, i;
+ int vcpu_idx;
bool ret;
rcu_read_lock();
@@ -1273,6 +1312,9 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
struct kvm_lapic_irq irq;
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
@@ -1289,6 +1331,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
+EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
@@ -1372,8 +1415,8 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
unsigned char alignment = offset & 0xf;
u32 result;
@@ -1391,7 +1434,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
APIC_REG_MASK(APIC_ESR) |
APIC_REG_MASK(APIC_ICR) |
- APIC_REG_MASK(APIC_ICR2) |
APIC_REG_MASK(APIC_LVTT) |
APIC_REG_MASK(APIC_LVTTHMR) |
APIC_REG_MASK(APIC_LVTPC) |
@@ -1402,9 +1444,19 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
APIC_REG_MASK(APIC_TMCCT) |
APIC_REG_MASK(APIC_TDCR);
- /* ARBPRI is not valid on x2APIC */
+ /*
+ * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
+ * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
+ * manually handled by the caller.
+ */
if (!apic_x2apic_mode(apic))
- valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_ICR2);
+ else
+ WARN_ON_ONCE(offset == APIC_ICR);
+
+ if (alignment + len > 4)
+ return 1;
if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset)))
return 1;
@@ -1426,7 +1478,6 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
@@ -1490,6 +1541,16 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
static void cancel_hv_timer(struct kvm_lapic *apic);
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ preempt_disable();
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ preempt_enable();
+ atomic_set(&apic->lapic_timer.pending, 0);
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
@@ -1498,11 +1559,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
if (apic->lapic_timer.timer_mode != timer_mode) {
if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
APIC_LVT_TIMER_TSCDEADLINE)) {
- hrtimer_cancel(&apic->lapic_timer.timer);
- preempt_disable();
- if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_timer(apic);
- preempt_enable();
+ cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
apic->lapic_timer.period = 0;
apic->lapic_timer.tscdeadline = 0;
@@ -1592,13 +1649,21 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ if (lapic_timer_advance_dynamic) {
+ adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
+ /*
+ * If the timer fired early, reread the TSC to account for the
+ * overhead of the above adjustment to avoid waiting longer
+ * than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ }
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
-
- if (lapic_timer_advance_dynamic)
- adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
@@ -1657,7 +1722,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
atomic_inc(&apic->lapic_timer.pending);
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
if (from_timer_fn)
kvm_vcpu_kick(vcpu);
}
@@ -1908,9 +1973,9 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
/* If the preempt notifier has already run, it also called apic_timer_expired */
if (!apic->lapic_timer.hv_timer_in_use)
goto out;
- WARN_ON(rcuwait_active(&vcpu->wait));
- cancel_hv_timer(apic);
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
@@ -1925,7 +1990,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
restart_apic_timer(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
@@ -1937,7 +2001,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
start_sw_timer(apic);
preempt_enable();
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
{
@@ -1976,7 +2039,20 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
+{
+ struct kvm *kvm = apic->vcpu->kvm;
+
+ if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
+ return;
+
+ if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
+ return;
+
+ kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+}
+
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -1984,10 +2060,12 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- else
+ kvm_lapic_xapic_id_updated(apic);
+ } else {
ret = 1;
+ }
break;
case APIC_TASKPRI:
@@ -2035,16 +2113,18 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
/* No delay here, so we always clear the pending bit */
- val &= ~(1 << 12);
+ val &= ~APIC_ICR_BUSY;
kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
-
case APIC_ICR2:
- if (!apic_x2apic_mode(apic))
- val &= 0xff000000;
- kvm_lapic_set_reg(apic, APIC_ICR2, val);
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
case APIC_LVT0:
@@ -2080,7 +2160,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
if (apic_lvtt_tscdeadline(apic))
break;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
break;
@@ -2104,10 +2184,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR,
- APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
- } else
+ if (apic_x2apic_mode(apic))
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+ else
ret = 1;
break;
default:
@@ -2115,11 +2194,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
}
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
kvm_recalculate_apic_map(apic->vcpu->kvm);
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, const void *data)
@@ -2163,12 +2246,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
- u32 val = 0;
-
- /* hw has done the conditional check and inst decode */
- offset &= 0xff0;
-
- kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val);
+ u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset);
/* TODO: optimize to just emulate side effect w/o one more write */
kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
@@ -2225,10 +2303,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
@@ -2245,9 +2320,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- value |= MSR_IA32_APICBASE_BSP;
-
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
@@ -2261,6 +2333,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_branch_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
} else {
static_branch_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
@@ -2271,14 +2345,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
+ static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
if ((value & MSR_IA32_APICBASE_ENABLE) &&
- apic->base_address != APIC_DEFAULT_PHYS_BASE)
- pr_warn_once("APIC base relocation is unsupported by KVM");
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
}
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
@@ -2290,7 +2366,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
apic->irr_pending = true;
apic->isr_count = 1;
} else {
- apic->irr_pending = (apic_search_irr(apic) != -1);
+ /*
+ * Don't clear irr_pending, searching the IRR can race with
+ * updates from the CPU as APICv is still active from hardware's
+ * perspective. The flag will be cleared as appropriate when
+ * KVM injects the interrupt.
+ */
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
}
@@ -2299,19 +2380,25 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
int i;
+ if (!init_event) {
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ msr_val |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu, msr_val);
+ }
+
if (!apic)
return;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!init_event) {
- kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE);
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- }
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -2329,8 +2416,12 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic_x2apic_mode(apic))
kvm_apic_set_ldr(apic, 0);
kvm_lapic_set_reg(apic, APIC_ESR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR, 0);
- kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
kvm_lapic_set_reg(apic, APIC_TDCR, 0);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
for (i = 0; i < 8; i++) {
@@ -2342,15 +2433,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (kvm_vcpu_is_bsp(vcpu))
- kvm_lapic_set_base(vcpu,
- vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2455,8 +2544,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
}
/*
- * APIC is created enabled. This will prevent kvm_lapic_set_base from
- * thinking that APIC state has changed.
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
@@ -2549,6 +2638,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
if (*id != vcpu->vcpu_id)
@@ -2560,9 +2650,23 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
*id <<= 24;
}
- /* In x2APIC mode, the LDR is fixed and based on the id */
- if (set)
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+ * ICR is internally a single 64-bit register, but needs to be
+ * split to ICR+ICR2 in userspace for backwards compatibility.
+ */
+ if (set) {
*ldr = kvm_apic_calc_x2apic_ldr(*id);
+
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
+ } else {
+ kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
@@ -2603,20 +2707,19 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
__start_apic_timer(apic, APIC_TMCCT);
+ kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- static_call(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call(kvm_x86_hwapic_irr_update)(vcpu,
- apic_find_highest_irr(apic));
- static_call(kvm_x86_hwapic_isr_update)(vcpu,
- apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -2650,7 +2753,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
struct kvm_lapic *apic)
{
- bool pending;
int vector;
/*
* PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
@@ -2664,14 +2766,8 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
* -> host enabled PV EOI, guest executed EOI.
*/
BUG_ON(!pv_eoi_enabled(vcpu));
- pending = pv_eoi_get_pending(vcpu);
- /*
- * Clear pending bit in any case: it will be set again on vmentry.
- * While this might not be ideal from performance point of view,
- * this makes sure pv eoi is only enabled when we know it's safe.
- */
- pv_eoi_clr_pending(vcpu);
- if (pending)
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
return;
vector = apic_set_eoi(apic);
trace_kvm_pv_eoi(apic, vector);
@@ -2760,97 +2856,114 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4;
+ data &= ~APIC_ICR_BUSY;
- if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
- return 1;
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
+
+ if (reg == APIC_ICR) {
+ *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ return 0;
+ }
- if (reg == APIC_ICR2)
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
return 1;
- /* if this is ICR write vector before command */
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * can be written as such, all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_x2apic_icr_write(apic, data);
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
-int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR || reg == APIC_ICR2)
- return 1;
+ return kvm_lapic_msr_write(apic, reg, data);
+}
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
- *data = (((u64)high) << 32) | low;
+ if (reg == APIC_DFR)
+ return 1;
- return 0;
+ return kvm_lapic_msr_read(apic, reg, data);
}
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
if (!lapic_in_kernel(vcpu))
return 1;
- /* if this is ICR write vector before command */
- if (reg == APIC_ICR)
- kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return kvm_lapic_reg_write(apic, reg, (u32)data);
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
}
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 low, high = 0;
-
if (!lapic_in_kernel(vcpu))
return 1;
- if (kvm_lapic_reg_read(apic, reg, 4, &low))
- return 1;
- if (reg == APIC_ICR)
- kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
-
- *data = (((u64)high) << 32) | low;
-
- return 0;
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
}
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
u64 addr = data & ~KVM_MSR_ENABLED;
struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
unsigned long new_len;
+ int ret;
if (!IS_ALIGNED(addr, 4))
return 1;
- vcpu->arch.pv_eoi.msr_val = data;
- if (!pv_eoi_enabled(vcpu))
- return 0;
+ if (data & KVM_MSR_ENABLED) {
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
- if (addr == ghc->gpa && len <= ghc->len)
- new_len = ghc->len;
- else
- new_len = len;
+ ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ if (ret)
+ return ret;
+ }
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ vcpu->arch.pv_eoi.msr_val = data;
+
+ return 0;
}
-void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u8 sipi_vector;
@@ -2858,7 +2971,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
unsigned long pe;
if (!lapic_in_kernel(vcpu))
- return;
+ return 0;
/*
* Read pending events before calling the check_events
@@ -2866,12 +2979,12 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
*/
pe = smp_load_acquire(&apic->pending_events);
if (!pe)
- return;
+ return 0;
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
- return;
+ return r == -EBUSY ? 0 : r;
/*
* If an event has happened and caused a vmexit,
* we know INITs are latched and therefore
@@ -2892,7 +3005,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
if (test_bit(KVM_APIC_SIPI, &pe))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
- return;
+ return 0;
}
if (test_bit(KVM_APIC_INIT, &pe)) {
@@ -2909,14 +3022,17 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
+ return 0;
}
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 997c45a5963a..65bb2a8cf145 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -38,7 +38,6 @@ struct kvm_timer {
u64 tscdeadline;
u64 expired_tscdeadline;
u32 timer_advance_ns;
- s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
};
@@ -76,7 +75,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -85,9 +84,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
-int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
@@ -121,13 +117,14 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
#define VEC_POS(v) ((v) & (32 - 1))
@@ -153,19 +150,14 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
{
- return *((u32 *) (apic->regs + reg_off));
+ return *((u32 *) (regs + reg_off));
}
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
-static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
+ return __kvm_lapic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c68bfc3e2402..f8192864b496 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,6 +44,12 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
+
+#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
+
static __always_inline u64 rsvd_bits(int s, int e)
{
BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
@@ -59,23 +65,72 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+extern u8 __read_mostly shadow_phys_bits;
+
+static inline gfn_t kvm_mmu_max_gfn(void)
+{
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
+}
-void
-reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+static inline u8 kvm_get_shadow_phys_bits(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
-void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
- gpa_t nested_cr3);
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp);
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
@@ -97,69 +152,13 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
- vcpu->arch.mmu->shadow_root_level);
-}
-
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault);
-
-static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefault)
-{
-#ifdef CONFIG_RETPOLINE
- if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
- return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
-#endif
- return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
-}
-
-/*
- * Currently, we have two sorts of write-protection, a) the first one
- * write-protects guest page to sync the guest modification, b) another one is
- * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
- * between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
- * 2) the first case requires flushing tlb immediately avoiding corrupting
- * shadow page table between all vcpus so it should be in the protection of
- * mmu-lock. And the another case does not need to flush tlb until returning
- * the dirty bitmap to userspace since it only write-protects the page
- * logged in the bitmap, that means the page in the dirty bitmap is not
- * missed, so it can flush tlb out of mmu-lock.
- *
- * So, there is the problem: the first case can meet the corrupted tlb caused
- * by another case which write-protects pages but without flush tlb
- * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
- *
- * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
- * readonly, if that happens, we need to flush tlb. Fortunately,
- * mmu_spte_update() has already handled it perfectly.
- *
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
- * - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
- * case, otherwise
- * - if we fix page fault on the spte or do write-protection by dirty logging,
- * check PT_WRITABLE_MASK.
- *
- * TODO: introduce APIs to split these two cases.
- */
-static inline bool is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static inline bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
}
/*
@@ -172,27 +171,27 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
*/
static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pte_access, unsigned pte_pkey,
- unsigned pfec)
+ u64 access)
{
- int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ /* strip nested paging fault error codes */
+ unsigned int pfec = access;
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
/*
- * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
+ * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
+ * For implicit supervisor accesses, SMAP cannot be overridden.
*
- * If CPL = 3, SMAP applies to all supervisor-mode data accesses
- * (these are implicit supervisor accesses) regardless of the value
- * of EFLAGS.AC.
+ * SMAP works on supervisor accesses only, and not_smap can
+ * be set or not set when user access with neither has any bearing
+ * on the result.
*
- * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving
- * the result in X86_EFLAGS_AC. We then insert it in place of
- * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec,
- * but it will be one in index if SMAP checks are being overridden.
- * It is important to keep this branchless.
+ * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit;
+ * this bit will always be zero in pfec, but it will be one in index
+ * if SMAP checks are being disabled.
*/
- unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
- int index = (pfec >> 1) +
- (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
+ bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
+ int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
bool fault = (mmu->permissions[index] >> pte_access) & 1;
u32 errcode = PFERR_PRESENT_MASK;
@@ -227,4 +226,64 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
+{
+ /*
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
+ */
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+#else
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#endif
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+}
+
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
+{
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u64 access,
+ struct x86_exception *exception)
+{
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 486aa94ecf1d..17252f39bd7c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -48,13 +48,17 @@
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
+#include "paging.h"
+
extern bool itlb_multihit_kvm_mitigation;
-static int __read_mostly nx_huge_pages = -1;
+int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
@@ -63,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};
-static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
- .set = set_nx_huge_pages_recovery_ratio,
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
.get = param_get_uint,
};
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
-module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
@@ -94,17 +101,9 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
bool tdp_enabled = false;
static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-enum {
- AUDIT_PRE_PAGE_FAULT,
- AUDIT_POST_PAGE_FAULT,
- AUDIT_PRE_PTE_WRITE,
- AUDIT_POST_PTE_WRITE,
- AUDIT_PRE_SYNC,
- AUDIT_POST_SYNC
-};
-
#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
@@ -134,12 +133,22 @@ module_param(dbg, bool, 0644);
#include <trace/events/kvm.h>
-/* make pte_list_desc fit well in cache line */
-#define PTE_LIST_EXT 3
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+/*
+ * Slight optimization of cacheline layout, by putting `more' and `spte_count'
+ * at the start; then accessing it will only use one single cacheline for
+ * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ */
struct pte_list_desc {
- u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
+ /*
+ * Stores number of entries stored in the pte_list_desc. No need to be
+ * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
+ */
+ u64 spte_count;
+ u64 *sptes[PTE_LIST_EXT];
};
struct kvm_shadow_walk_iterator {
@@ -172,12 +181,78 @@ struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
static void mmu_spte_set(u64 *sptep, u64 spte);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
+/*
+ * Yes, lot's of underscores. They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the root_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool __maybe_unused \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
+{ \
+ return !!(regs->reg & flag); \
+}
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+ return !mmu->cpu_role.base.has_4_byte_gpte;
+}
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
static inline bool kvm_available_flush_tlb_with_range(void)
{
@@ -207,18 +282,13 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
-bool is_nx_huge_page_enabled(void)
-{
- return READ_ONCE(nx_huge_pages);
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
- u64 mask = make_mmio_spte(vcpu, gfn, access);
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
- trace_mark_mmio_spte(sptep, gfn, mask);
- mmu_spte_set(sptep, mask);
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +306,6 @@ static unsigned get_mmio_spte_access(u64 spte)
return spte & shadow_mmio_access_mask;
}
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access)
-{
- if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(vcpu, sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
u64 kvm_gen, spte_gen, gen;
@@ -262,28 +321,11 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
- /* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
- exception->error_code |= PFERR_RSVD_MASK;
- return UNMAPPED_GVA;
- }
-
- return gpa;
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
}
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.efer & EFER_NX;
-}
-
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -428,30 +470,6 @@ retry:
}
#endif
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!is_shadow_present_pte(spte))
- return false;
-
- /*
- * Always atomically update spte if it can be updated
- * out of mmu-lock, it can ensure dirty bit is not lost,
- * also, it can help us to get a stable is_writable_pte()
- * to ensure tlb flush is not missed.
- */
- if (spte_can_locklessly_be_made_writable(spte) ||
- is_access_track_spte(spte))
- return true;
-
- if (spte_ad_enabled(spte)) {
- if ((spte & shadow_accessed_mask) == 0 ||
- (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
- return true;
- }
-
- return false;
-}
-
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -473,6 +491,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
WARN_ON(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -492,11 +511,9 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changed.
*
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+ * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
+ * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
+ * spte, even though the writable spte might be cached on a CPU's TLB.
*
* Returns true if the TLB needs to be flushed
*/
@@ -513,7 +530,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_can_locklessly_be_made_writable(old_spte) &&
+ if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;
@@ -539,20 +556,24 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* Rules for using mmu_spte_clear_track_bits:
* It sets the sptep from present to nonpresent, and track the
* state bits, it is used to clear the last level sptep.
- * Returns non-zero if the PTE was previously valid.
+ * Returns the old PTE.
*/
-static int mmu_spte_clear_track_bits(u64 *sptep)
+static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
- if (!spte_has_volatile_bits(old_spte))
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
if (!is_shadow_present_pte(old_spte))
- return 0;
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
pfn = spte_to_pfn(old_spte);
@@ -569,7 +590,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
- return 1;
+ return old_spte;
}
/*
@@ -587,24 +608,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Restore an acc-track PTE back to a regular PTE */
-static u64 restore_acc_track_spte(u64 spte)
-{
- u64 new_spte = spte;
- u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
- & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
-
- WARN_ON_ONCE(spte_ad_enabled(spte));
- WARN_ON_ONCE(!is_access_track_spte(spte));
-
- new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
- SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
- new_spte |= saved_bits;
-
- return new_spte;
-}
-
/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
@@ -633,28 +636,36 @@ static bool mmu_spte_age(u64 *sptep)
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- /*
- * Prevent page table teardown by making any free-er wait during
- * kvm_flush_remote_tlbs() IPI to all active vcpus.
- */
- local_irq_disable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
- /*
- * Make sure a following spte read is not reordered ahead of the write
- * to vcpu->mode.
- */
- smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- /*
- * Make sure the write to vcpu->mode is not reordered in front of
- * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
- * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
- */
- smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
- local_irq_enable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
}
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
@@ -700,6 +711,9 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
+ if (sp->role.passthrough)
+ return sp->gfn;
+
if (!sp->role.direct)
return sp->gfns[index];
@@ -708,6 +722,11 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
+ if (sp->role.passthrough) {
+ WARN_ON_ONCE(gfn != sp->gfn);
+ return;
+ }
+
if (!sp->role.direct) {
sp->gfns[index] = gfn;
return;
@@ -725,8 +744,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
* handling slots that are not large page aligned.
*/
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+ const struct kvm_memory_slot *slot, int level)
{
unsigned long idx;
@@ -734,7 +752,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
-static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
@@ -747,12 +765,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
}
}
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, 1);
}
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, -1);
}
@@ -841,7 +859,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- int i, count = 0;
+ int count = 0;
if (!rmap_head->val) {
rmap_printk("%p %llx 0->1\n", spte, *spte);
@@ -851,24 +869,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
+ desc->spte_count = 2;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1]) {
+ while (desc->spte_count == PTE_LIST_EXT) {
count += PTE_LIST_EXT;
-
if (!desc->more) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
+ desc->spte_count = 0;
break;
}
desc = desc->more;
}
- for (i = 0; desc->sptes[i]; ++i)
- ++count;
- desc->sptes[i] = spte;
+ count += desc->spte_count;
+ desc->sptes[desc->spte_count++] = spte;
}
return count;
}
@@ -878,13 +896,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i,
struct pte_list_desc *prev_desc)
{
- int j;
+ int j = desc->spte_count - 1;
- for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
- ;
desc->sptes[i] = desc->sptes[j];
desc->sptes[j] = NULL;
- if (j != 0)
+ desc->spte_count--;
+ if (desc->spte_count)
return;
if (!prev_desc && !desc->more)
rmap_head->val = 0;
@@ -917,7 +934,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(rmap_head,
desc, i, prev_desc);
@@ -932,60 +949,99 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
}
}
-static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ u64 *sptep)
{
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
__pte_list_remove(sptep, rmap_head);
}
-static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+/* Return true if rmap existed, false otherwise */
+static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
- unsigned long idx;
+ struct pte_list_desc *desc, *next;
+ int i;
- idx = gfn_to_index(gfn, slot->base_gfn, level);
- return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+ if (!rmap_head->val)
+ return false;
+
+ if (!(rmap_head->val & 1)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* rmap_head is meaningless now, remember to reset it */
+ rmap_head->val = 0;
+ return true;
}
-static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
- struct kvm_mmu_page *sp)
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
+ struct pte_list_desc *desc;
+ unsigned int count = 0;
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- return __gfn_to_rmap(gfn, sp->role.level, slot);
+ if (!rmap_head->val)
+ return 0;
+ else if (!(rmap_head->val & 1))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ while (desc) {
+ count += desc->spte_count;
+ desc = desc->more;
+ }
+
+ return count;
}
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
{
- struct kvm_mmu_memory_cache *mc;
+ unsigned long idx;
- mc = &vcpu->arch.mmu_pte_list_desc_cache;
- return kvm_mmu_memory_cache_nr_free_objects(mc);
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_page *sp;
- struct kvm_rmap_head *rmap_head;
+ struct kvm_mmu_memory_cache *mc;
- sp = sptep_to_sp(spte);
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- return pte_list_add(vcpu, spte, rmap_head);
+ mc = &vcpu->arch.mmu_pte_list_desc_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(mc);
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmap_head = gfn_to_rmap(kvm, gfn, sp);
+
+ /*
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
__pte_list_remove(spte, rmap_head);
}
@@ -1067,7 +1123,9 @@ out:
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (mmu_spte_clear_track_bits(sptep))
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
rmap_remove(kvm, sptep);
}
@@ -1077,7 +1135,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
if (is_large_pte(*sptep)) {
WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
- --kvm->stat.lpages;
return true;
}
@@ -1112,21 +1169,20 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ !(pt_protect && is_mmu_writable_spte(spte)))
return false;
rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
- spte &= ~SPTE_MMU_WRITEABLE;
+ spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1166,7 +1222,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
* Returns true iff any D or W bits were cleared.
*/
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1188,8 +1244,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
* @gfn_offset: start of the BITS_PER_LONG pages we care about
* @mask: indicates which pages we should protect
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * Used when we do not need to care about huge page mappings.
*/
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
@@ -1200,10 +1255,14 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ rmap_write_protect(rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1229,9 +1288,13 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
__rmap_clear_dirty(kvm, rmap_head, slot);
/* clear the first set bit */
@@ -1246,13 +1309,39 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
* enable dirty logging for them.
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * We need to care about huge page mappings: e.g. during dirty logging we may
+ * have such mappings.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
+ /*
+ * Huge pages are NOT write protected when we start dirty logging in
+ * initially-all-set mode; must write protect them here so that they
+ * are split to 4K on the first write.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
+ * of memslot has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /* Now handle 4K PTEs. */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1265,92 +1354,83 @@ int kvm_cpu_dirty_log_size(void)
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn)
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
{
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = gfn_to_rmap(gfn, i, slot);
+ write_protected |= rmap_write_protect(rmap_head, true);
+ }
}
if (is_tdp_mmu_enabled(kvm))
write_protected |=
- kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
return write_protected;
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
- u64 *sptep;
- struct rmap_iterator iter;
- bool flush = false;
-
- while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("spte %p %llx.\n", sptep, *sptep);
-
- pte_list_remove(rmap_head, sptep);
- flush = true;
- }
-
- return flush;
+ return pte_list_destroy(kvm, rmap_head);
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
return kvm_zap_rmapp(kvm, rmap_head, slot);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
{
u64 *sptep;
struct rmap_iterator iter;
- int need_flush = 0;
+ bool need_flush = false;
u64 new_spte;
- pte_t *ptep = (pte_t *)data;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
+ WARN_ON(pte_huge(pte));
+ new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
- need_flush = 1;
+ need_flush = true;
- if (pte_write(*ptep)) {
- pte_list_remove(rmap_head, sptep);
+ if (pte_write(pte)) {
+ pte_list_remove(kvm, rmap_head, sptep);
goto restart;
} else {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(
*sptep, new_pfn);
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
mmu_spte_set(sptep, new_spte);
}
}
if (need_flush && kvm_available_flush_tlb_with_range()) {
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
- return 0;
+ return false;
}
return need_flush;
@@ -1358,7 +1438,7 @@ restart:
struct slot_rmap_walk_iterator {
/* input fields. */
- struct kvm_memory_slot *slot;
+ const struct kvm_memory_slot *slot;
gfn_t start_gfn;
gfn_t end_gfn;
int start_level;
@@ -1378,14 +1458,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
- iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
- iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
- iterator->slot);
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- struct kvm_memory_slot *slot, int start_level,
+ const struct kvm_memory_slot *slot, int start_level,
int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
@@ -1404,9 +1483,11 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
- if (++iterator->rmap <= iterator->end_rmap) {
+ while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
- return;
+
+ if (iterator->rmap->val)
+ return;
}
if (++iterator->level > iterator->end_level) {
@@ -1424,93 +1505,54 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ rmap_handler_t handler)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct slot_rmap_walk_iterator iterator;
- int ret = 0;
- int i;
+ bool ret = false;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- gfn_start, gfn_end - 1,
- &iterator)
- ret |= handler(kvm, iterator.rmap, memslot,
- iterator.gfn, iterator.level, data);
- }
- }
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator)
+ ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+ iterator.level, range->pte);
return ret;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn, int level,
- unsigned long data))
-{
- return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int r;
+ bool flush = false;
- r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
- return r;
+ return flush;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int r;
+ bool flush = false;
- r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+ flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
- return r;
+ return flush;
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1519,57 +1561,65 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
young |= mmu_spte_age(sptep);
- trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
- return 1;
- return 0;
+ return true;
+ return false;
}
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn)
{
- struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
sp = sptep_to_sp(spte);
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(vcpu, spte, rmap_head);
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
-
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
+ kvm_flush_remote_tlbs_with_address(
+ vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ }
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
- young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+ young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
- young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+ young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
}
@@ -1596,7 +1646,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -1689,7 +1739,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 0;
+ return -1;
}
#define KVM_PAGE_ARRAY_NR 16
@@ -1790,32 +1840,35 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+ if (sp->role.direct)
+ return false;
+
+ if (sp->role.passthrough)
+ return false;
+
+ return true;
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
-#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
for_each_valid_sp(_kvm, _sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
- if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
-static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
- return sp->role.cr0_wp && sp->role.smap_andnot_wp;
-}
+ int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
-/* @sp->gfn should be write-protected at the call site */
-static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
- vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- return true;
+ return ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1832,53 +1885,14 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
- struct list_head *invalid_list,
- bool remote_flush, bool local_flush)
-{
- if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
- return;
-
- if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-}
-
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-static void mmu_audit_disable(void) { }
-#endif
-
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return sp->role.invalid ||
- unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- return __kvm_sync_page(vcpu, sp, invalid_list);
-}
-
-/* @gfn should be write-protected at the call site */
-static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *s;
- bool ret = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (!s->unsync)
- continue;
-
- WARN_ON(s->role.level != PG_LEVEL_4K);
- ret |= kvm_sync_page(vcpu, s, invalid_list);
- }
+ if (sp->role.invalid)
+ return true;
- return ret;
+ /* TDP MMU pages due not use the MMU generation. */
+ return !sp->tdp_mmu_page &&
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
struct mmu_page_path {
@@ -1953,8 +1967,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
} while (!sp->unsync_children);
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent, bool can_yield)
{
int i;
struct kvm_mmu_page *sp;
@@ -1967,25 +1981,32 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu, sp->gfn);
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
if (protected) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
flush = false;
}
for_each_sp(pages, sp, parents, i) {
- flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ if (!can_yield) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ return -EINTR;
+ }
+
cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
flush = false;
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ return 0;
}
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2005,27 +2026,26 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int direct,
unsigned int access)
{
- bool direct_mmu = vcpu->arch.mmu->direct_map;
+ bool direct_mmu = vcpu->arch.mmu->root_role.direct;
union kvm_mmu_page_role role;
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
- bool need_sync = false;
- bool flush = false;
+ int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
- role = vcpu->arch.mmu->mmu_role.base;
+ role = vcpu->arch.mmu->root_role;
role.level = level;
role.direct = direct;
- if (role.direct)
- role.gpte_is_8_bytes = true;
role.access = access;
- if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (role.has_4_byte_gpte) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
+ if (level <= vcpu->arch.mmu->cpu_role.base.level)
+ role.passthrough = 0;
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
@@ -2034,29 +2054,47 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
continue;
}
- if (!need_sync && sp->unsync)
- need_sync = true;
-
- if (sp->role.word != role.word)
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
continue;
+ }
if (direct_mmu)
goto trace_get_page;
if (sp->unsync) {
- /* The page is good, but __kvm_sync_page might still end
- * up zapping it. If so, break in order to rebuild it.
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
*/
- if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ if (ret > 0)
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
- if (sp->unsync_children)
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
__clear_sp_write_flooding_count(sp);
trace_get_page:
@@ -2071,23 +2109,15 @@ trace_get_page:
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
- if (!direct) {
- /*
- * we should do write protection before syncing pages
- * otherwise the content of the synced shadow page may
- * be inconsistent with guest page table.
- */
+ if (sp_has_gptes(sp)) {
account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
+ if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
-
- if (level > PG_LEVEL_4K && need_sync)
- flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
trace_kvm_mmu_get_page(sp, true);
-
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
out:
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
return sp;
@@ -2099,19 +2129,19 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
{
iterator->addr = addr;
iterator->shadow_addr = root;
- iterator->level = vcpu->arch.mmu->shadow_root_level;
+ iterator->level = vcpu->arch.mmu->root_role.level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
- vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
- !vcpu->arch.mmu->direct_map)
- --iterator->level;
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->root_role.direct)
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
* prev_root is currently only used for 64-bit hosts. So only
* the active root_hpa is valid here.
*/
- BUG_ON(root != vcpu->arch.mmu->root_hpa);
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
iterator->shadow_addr
= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
@@ -2125,7 +2155,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, u64 addr)
{
- shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
addr);
}
@@ -2142,7 +2172,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
u64 spte)
{
- if (is_last_spte(spte, iterator->level)) {
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}
@@ -2206,8 +2236,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
- if (is_large_pte(pte))
- --kvm->stat.lpages;
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
@@ -2241,7 +2269,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -2279,18 +2307,18 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
struct list_head *invalid_list,
int *nr_zapped)
{
- bool list_unstable;
+ bool list_unstable, zapped_root = false;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(kvm, sp);
+ kvm_mmu_unlink_parents(sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
- if (!sp->role.invalid && !sp->role.direct)
+ if (!sp->role.invalid && sp_has_gptes(sp))
unaccount_shadowed(kvm, sp);
if (sp->unsync)
@@ -2321,14 +2349,20 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
* in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
* treats invalid shadow pages as being obsolete.
*/
- if (!is_obsolete_sp(kvm, sp))
- kvm_reload_remote_mmus(kvm);
+ zapped_root = !is_obsolete_sp(kvm, sp);
}
if (sp->lpage_disallowed)
unaccount_huge_nx_page(kvm, sp);
sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
return list_unstable;
}
@@ -2421,6 +2455,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too aggressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
return 0;
@@ -2455,7 +2498,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
@@ -2472,7 +2515,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- if (vcpu->arch.mmu->direct_map)
+ if (vcpu->arch.mmu->root_role.direct)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2482,33 +2525,79 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
return r;
}
-static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
- ++vcpu->kvm->stat.mmu_unsync;
+ ++kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
}
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
+ * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch)
{
struct kvm_mmu_page *sp;
+ bool locked = false;
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
- return true;
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
+ if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
+ return -EPERM;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP (can_unsync = false). In
+ * that case, KVM must complete emulation of the guest TLB flush before
+ * allowing shadow pages to become unsync (writable by the guest).
+ */
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
if (!can_unsync)
- return true;
+ return -EPERM;
if (sp->unsync)
continue;
+ if (prefetch)
+ return -EEXIST;
+
+ /*
+ * TDP MMU page faults require an additional spinlock as they
+ * run with mmu_lock held for read, not write, and the unsync
+ * logic is not thread safe. Take the spinklock regardless of
+ * the MMU type to avoid extra conditionals/parameters, there's
+ * no meaningful penalty if mmu_lock is held for write.
+ */
+ if (!locked) {
+ locked = true;
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * Recheck after taking the spinlock, a different vCPU
+ * may have since marked the page unsync. A false
+ * positive on the unprotected check above is not
+ * possible as clearing sp->unsync _must_ hold mmu_lock
+ * for write, i.e. unsync cannot transition from 0->1
+ * while this CPU holds mmu_lock for read (or write).
+ */
+ if (READ_ONCE(sp->unsync))
+ continue;
+ }
+
WARN_ON(sp->role.level != PG_LEVEL_4K);
- kvm_unsync_page(vcpu, sp);
+ kvm_unsync_page(kvm, sp);
}
+ if (locked)
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
/*
* We need to ensure that the marking of unsync pages is visible
@@ -2532,8 +2621,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
* 2.2 Guest issues TLB flush.
* That causes a VM Exit.
*
- * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
- * Since it is false, so it just returns.
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
*
* 2.4 Guest accesses GVA X.
* Since the mapping in the SP was not updated,
@@ -2544,55 +2633,40 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
* (sp->unsync = true)
*
* The write barrier below ensures that 1.1 happens before 1.2 and thus
- * the situation in 2.4 does not arise. The implicit barrier in 2.2
- * pairs with this write barrier.
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
*/
smp_wmb();
- return false;
-}
-
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool can_unsync, bool host_writable)
-{
- u64 spte;
- struct kvm_mmu_page *sp;
- int ret;
-
- if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
- return 0;
-
- sp = sptep_to_sp(sptep);
-
- ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
- can_unsync, host_writable, sp_ad_disabled(sp), &spte);
-
- if (spte & PT_WRITABLE_MASK)
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
-
- if (*sptep == spte)
- ret |= SET_SPTE_SPURIOUS;
- else if (mmu_spte_update(sptep, spte))
- ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
- return ret;
+ return 0;
}
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, bool write_fault, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
int was_rmapped = 0;
- int rmap_count;
- int set_spte_ret;
int ret = RET_PF_FIXED;
bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
+ if (unlikely(is_noslot_pfn(pfn))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2614,58 +2688,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
- speculative, true, host_writable);
- if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ true, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ flush |= mmu_spte_update(sptep, spte);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ }
+
+ if (wrprot) {
if (write_fault)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ if (flush)
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- if (unlikely(is_mmio_spte(*sptep)))
- ret = RET_PF_EMULATE;
-
- /*
- * The fault is fully spurious if and only if the new SPTE and old SPTE
- * are identical, and emulation is not required.
- */
- if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
- WARN_ON_ONCE(!was_rmapped);
- return RET_PF_SPURIOUS;
- }
-
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- trace_kvm_mmu_set_spte(level, gfn, sptep);
- if (!was_rmapped && is_large_pte(*sptep))
- ++vcpu->kvm->stat.lpages;
- if (is_shadow_present_pte(*sptep)) {
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
- }
+ if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
+ kvm_update_page_stats(vcpu->kvm, level, 1);
+ rmap_add(vcpu, slot, sptep, gfn);
}
return ret;
}
-static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot)
- return KVM_PFN_ERR_FAULT;
-
- return gfn_to_pfn_memslot_atomic(slot, gfn);
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2686,8 +2738,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, slot, start, access, gfn,
+ page_to_pfn(pages[i]), NULL);
put_page(pages[i]);
}
@@ -2710,11 +2762,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
- break;
+ return;
start = NULL;
} else if (!start)
start = spte;
}
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
}
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -2745,11 +2799,15 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
unsigned long hva;
- pte_t *pte;
- int level;
+ unsigned long flags;
+ int level = PG_LEVEL_4K;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
@@ -2764,17 +2822,52 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
- pte = lookup_address_in_mm(kvm->mm, hva, &level);
- if (unlikely(!pte))
- return PG_LEVEL_4K;
+ /*
+ * Lookup the mapping level in the current mm. The information
+ * may become stale soon, but it is safe to use as long as
+ * 1) mmu_notifier_retry was checked after taking mmu_lock, and
+ * 2) mmu_lock is taken now.
+ *
+ * We still need to disable IRQs to prevent concurrent tear down
+ * of page tables.
+ */
+ local_irq_save(flags);
+
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+ if (pud_large(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_large(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
+ int host_level;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -2786,60 +2879,50 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ return min(host_level, max_level);
}
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level)
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot;
- kvm_pfn_t pfn = *pfnp;
+ struct kvm_memory_slot *slot = fault->slot;
kvm_pfn_t mask;
- int level;
-
- *req_level = PG_LEVEL_4K;
-
- if (unlikely(max_level == PG_LEVEL_4K))
- return PG_LEVEL_4K;
- if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
- return PG_LEVEL_4K;
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
- if (!slot)
- return PG_LEVEL_4K;
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K)
- return level;
+ if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+ return;
- *req_level = level = min(level, max_level);
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- if (huge_page_disallowed)
- return PG_LEVEL_4K;
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->pfn,
+ fault->max_level);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
* the pmd can't be split from under us.
*/
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- *pfnp = pfn & ~mask;
-
- return level;
+ fault->goal_level = fault->req_level;
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
}
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp)
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
{
- int level = *goal_levelp;
-
- if (cur_level == level && level > PG_LEVEL_4K &&
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
@@ -2849,67 +2932,57 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
* patching back for them into pfn the next 9 bits of
* the address.
*/
- u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
- KVM_PAGES_PER_HPAGE(level - 1);
- *pfnp |= gfn & page_mask;
- (*goal_levelp)--;
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool is_tdp)
+static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, req_level, ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- gfn_t base_gfn = gfn;
-
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
- for_each_shadow_entry(vcpu, gpa, it) {
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
drop_large_spte(vcpu, it.sptep);
- if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
- it.level - 1, true, ACC_ALL);
-
- link_shadow_page(vcpu, it.sptep, sp);
- if (is_tdp && huge_page_disallowed &&
- req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
- }
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->is_tdp && fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
+ account_huge_nx_page(vcpu->kvm, sp);
}
- ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
- write, level, base_gfn, pfn, prefault,
- map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
direct_pte_prefetch(vcpu, it.sptep);
- ++vcpu->stat.pf_fixed;
return ret;
}
@@ -2936,54 +3009,70 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
}
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access,
- int *ret_val)
+static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access)
{
/* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(pfn))) {
- *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- return true;
- }
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
- if (unlikely(is_noslot_pfn(pfn)))
- vcpu_cache_mmio_info(vcpu, gva, gfn,
+ if (unlikely(!fault->slot)) {
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop. Do not cache MMIO
+ * whose gfn is greater than host.MAXPHYADDR, any guest that
+ * generates such gfns is running nested and is being tricked
+ * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
+ * and only if L1's MAXPHYADDR is inaccurate with respect to
+ * the hardware's).
+ */
+ if (unlikely(!enable_mmio_caching) ||
+ unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
+ }
- return false;
+ return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(u32 error_code)
+static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
{
/*
- * Do not fix the mmio spte with invalid generation number which
- * need to be updated by slow page fault path.
+ * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
+ * reach the common page fault handler if the SPTE has an invalid MMIO
+ * generation number. Refreshing the MMIO generation needs to go down
+ * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
*/
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return false;
-
- /* See if the page fault is due to an NX violation */
- if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
- == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ if (fault->rsvd)
return false;
/*
* #PF can be fast if:
- * 1. The shadow page table entry is not present, which could mean that
- * the fault is potentially caused by access tracking (if enabled).
- * 2. The shadow page table entry is present and the fault
- * is caused by write-protect, that means we just need change the W
- * bit of the spte which can be done out of mmu-lock.
*
- * However, if access tracking is disabled we know that a non-present
- * page must be a genuine page fault where we have to create a new SPTE.
- * So, if access tracking is disabled, we return true only for write
- * accesses to a present page.
+ * 1. The shadow page table entry is not present and A/D bits are
+ * disabled _by KVM_, which could mean that the fault is potentially
+ * caused by access tracking (if enabled). If A/D bits are enabled
+ * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
+ * bits for L2 and employ access tracking, but the fast page fault
+ * mechanism only supports direct MMUs.
+ * 2. The shadow page table entry is present, the access is a write,
+ * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
+ * the fault was caused by a write-protection violation. If the
+ * SPTE is MMU-writable (determined later), the fault can be fixed
+ * by setting the Writable bit, which can be done out of mmu_lock.
*/
+ if (!fault->present)
+ return !kvm_ad_enabled();
- return shadow_acc_track_mask != 0 ||
- ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
- == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
+ /*
+ * Note, instruction fetches and writes are mutually exclusive, ignore
+ * the "exec" flag.
+ */
+ return fault->write;
}
/*
@@ -2991,13 +3080,9 @@ static bool page_fault_can_be_fast(u32 error_code)
* someone else modified the SPTE from its original value.
*/
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, u64 new_spte)
{
- gfn_t gfn;
-
- WARN_ON(!sp->role.direct);
-
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
* order to eliminate unnecessary PML logging. See comments in
@@ -3013,24 +3098,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
return false;
- if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
- /*
- * The gfn of direct spte is stable since it is
- * calculated by sp->gfn.
- */
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
- }
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
return true;
}
-static bool is_access_allowed(u32 fault_err_code, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
{
- if (fault_err_code & PFERR_FETCH_MASK)
+ if (fault->exec)
return is_executable_pte(spte);
- if (fault_err_code & PFERR_WRITE_MASK)
+ if (fault->write)
return is_writable_pte(spte);
/* Fault was on Read access */
@@ -3038,18 +3117,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
}
/*
- * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
*/
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{
struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+ }
+
+ return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
u64 spte = 0ull;
+ u64 *sptep = NULL;
uint retry_count = 0;
- if (!page_fault_can_be_fast(error_code))
+ if (!page_fault_can_be_fast(fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3057,11 +3158,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
- if (!is_shadow_present_pte(spte))
- break;
+ if (is_tdp_mmu(vcpu->arch.mmu))
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+
+ if (!is_shadow_present_pte(spte))
+ break;
- sp = sptep_to_sp(iterator.sptep);
+ sp = sptep_to_sp(sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3075,43 +3180,54 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(error_code, spte)) {
+ if (is_access_allowed(fault, spte)) {
ret = RET_PF_SPURIOUS;
break;
}
new_spte = spte;
- if (is_access_track_spte(spte))
+ /*
+ * KVM only supports fixing page faults outside of MMU lock for
+ * direct MMUs, nested MMUs are always indirect, and KVM always
+ * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
+ * enabled, the SPTE can't be an access-tracked SPTE.
+ */
+ if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte);
/*
- * Currently, to simplify the code, write-protection can
- * be removed in the fast path only if the SPTE was
- * write-protected for dirty-logging or access tracking.
+ * To keep things simple, only SPTEs that are MMU-writable can
+ * be made fully writable outside of mmu_lock, e.g. only SPTEs
+ * that were write-protected for dirty-logging or access
+ * tracking are handled here. Don't bother checking if the
+ * SPTE is writable to prioritize running with A/D bits enabled.
+ * The is_access_allowed() check above handles the common case
+ * of the fault being spurious, and the SPTE is known to be
+ * shadow-present, i.e. except for access tracking restoration
+ * making the new SPTE writable, the check is wasteful.
*/
- if ((error_code & PFERR_WRITE_MASK) &&
- spte_can_locklessly_be_made_writable(spte)) {
+ if (fault->write && is_mmu_writable_spte(spte)) {
new_spte |= PT_WRITABLE_MASK;
/*
- * Do not fix write-permission on the large spte. Since
- * we only dirty the first page into the dirty-bitmap in
+ * Do not fix write-permission on the large spte when
+ * dirty logging is enabled. Since we only dirty the
+ * first page into the dirty-bitmap in
* fast_pf_fix_direct_spte(), other pages are missed
* if its slot has dirty logging enabled.
*
* Instead, we let the slow page fault path create a
* normal spte to fix the access.
- *
- * See the comments in kvm_arch_commit_memory_region().
*/
- if (sp->role.level > PG_LEVEL_4K)
+ if (sp->role.level > PG_LEVEL_4K &&
+ kvm_slot_dirty_track_enabled(fault->slot))
break;
}
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(error_code, new_spte))
+ !is_access_allowed(fault, new_spte))
break;
/*
@@ -3119,8 +3235,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
- new_spte)) {
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
ret = RET_PF_FIXED;
break;
}
@@ -3133,10 +3248,12 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
} while (true);
- trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, ret);
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
walk_shadow_page_lockless_end(vcpu);
+ if (ret != RET_PF_INVALID)
+ vcpu->stat.pf_fast++;
+
return ret;
}
@@ -3149,30 +3266,32 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ if (WARN_ON(!sp))
+ return;
- if (kvm_mmu_put_root(kvm, sp)) {
- if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_free_root(kvm, sp);
- else if (sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- }
+ if (is_tdp_mmu_page(sp))
+ kvm_tdp_mmu_put_root(kvm, sp, false);
+ else if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
*root_hpa = INVALID_PAGE;
}
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
- struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
- bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+ bool free_active_root;
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
- if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
VALID_PAGE(mmu->prev_roots[i].hpa))
@@ -3190,18 +3309,20 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(kvm,
- &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ if (to_shadow_page(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
}
- mmu->root_pgd = 0;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
@@ -3209,6 +3330,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+ * This should not be called while L2 is active, L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->root_role.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ if (!to_shadow_page(root_hpa) ||
+ to_shadow_page(root_hpa)->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
+
+
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
@@ -3226,153 +3374,335 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- write_lock(&vcpu->kvm->mmu_lock);
-
- if (make_mmu_pages_available(vcpu)) {
- write_unlock(&vcpu->kvm->mmu_lock);
- return INVALID_PAGE;
- }
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->root_role.level;
hpa_t root;
unsigned i;
+ int r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
- true);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_value;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
- } else
- BUG();
+ mmu->root.hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
- /* root_pgd is ignored for direct MMUs. */
- vcpu->arch.mmu->root_pgd = 0;
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
- return 0;
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ /*
+ * Check if anything actually needs to be allocated, e.g. all metadata
+ * will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Ensure that shadow_root_allocated becomes true strictly after
+ * all the related pointers are set.
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
}
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
- u64 pdptr, pm_mask;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
hpa_t root;
- int i;
+ unsigned i;
+ int r;
- root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ root_pgd = mmu->get_guest_pgd(vcpu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
/*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ return 1;
+ }
+ }
+
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
+ if (r)
+ return r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- vcpu->arch.mmu->shadow_root_level, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->root_role.level, false);
+ mmu->root.hpa = root;
goto set_root_pgd;
}
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
/*
* We shadow a 32 bit page table. This may be a legacy 2-level
* or a PAE 3-level page table. In either case we need to be aware that
* the shadow page table may be a PAE or a long mode page table.
*/
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask = PT_PRESENT_MASK | shadow_me_value;
+ if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
- if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
- if (!(pdptr & PT_PRESENT_MASK)) {
- vcpu->arch.mmu->pae_root[i] = 0;
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
}
- root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
root = mmu_alloc_root(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+ mmu->pae_root[i] = root | pm_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL)
+ mmu->root.hpa = __pa(mmu->pml5_root);
+ else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
+ mmu->root.hpa = __pa(mmu->pml4_root);
+ else
+ mmu->root.hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root.pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
/*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
- if (vcpu->arch.mmu->lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
-
- u64 *lm_root;
+ if (mmu->root_role.direct ||
+ mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ mmu->root_role.level < PT64_ROOT_4LEVEL)
+ return 0;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (lm_root == NULL)
- return 1;
+ /*
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+ * all MMus are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
+ */
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
+ return 0;
- lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
+ return -EIO;
- vcpu->arch.mmu->lm_root = lm_root;
- }
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+#ifdef CONFIG_X86_64
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
}
+#endif
-set_root_pgd:
- vcpu->arch.mmu->root_pgd = root_pgd;
+ mmu->pae_root = pae_root;
+ mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static bool is_unsync_root(hpa_t root)
{
- if (vcpu->arch.mmu->direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = to_shadow_page(root);
+
+ /*
+ * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
+ * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3380,87 +3710,62 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *sp;
- if (vcpu->arch.mmu->direct_map)
+ if (vcpu->arch.mmu->root_role.direct)
return;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
+ if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root.hpa;
sp = to_shadow_page(root);
- /*
- * Even if another CPU was marking the SP as unsync-ed
- * simultaneously, any guest page table changes are not
- * guaranteed to be visible anyway until this VCPU issues a TLB
- * flush strictly after those changes are made. We only need to
- * ensure that the other CPU sets these flags before any actual
- * changes to the page tables are made. The comments in
- * mmu_need_write_protect() describe what could go wrong if this
- * requirement isn't satisfied.
- */
- if (!smp_load_acquire(&sp->unsync) &&
- !smp_load_acquire(&sp->unsync_children))
+ if (!is_unsync_root(root))
return;
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-
- mmu_sync_children(vcpu, sp);
-
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ mmu_sync_children(vcpu, sp, true);
write_unlock(&vcpu->kvm->mmu_lock);
return;
}
write_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
- mmu_sync_children(vcpu, sp);
+ mmu_sync_children(vcpu, sp, true);
}
}
- kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access, struct x86_exception *exception)
-{
- if (exception)
- exception->error_code = 0;
- return vaddr;
-}
-
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
{
- if (exception)
- exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
-}
+ unsigned long roots_to_free = 0;
+ int i;
-static bool
-__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
-{
- int bit7 = (pte >> 7) & 1;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
- return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
}
-static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u64 access,
+ struct x86_exception *exception)
{
- return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+ if (exception)
+ exception->error_code = 0;
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3481,6 +3786,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
*/
static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
{
@@ -3488,8 +3795,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
int leaf = -1;
u64 spte;
- walk_shadow_page_lockless_begin(vcpu);
-
for (shadow_walk_init(&iterator, vcpu, addr),
*root_level = iterator.level;
shadow_walk_okay(&iterator);
@@ -3498,13 +3803,8 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
spte = mmu_spte_get_lockless(iterator.sptep);
sptes[leaf] = spte;
-
- if (!is_shadow_present_pte(spte))
- break;
}
- walk_shadow_page_lockless_end(vcpu);
-
return leaf;
}
@@ -3516,16 +3816,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf, level;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
- *sptep = 0ull;
- return reserved;
- }
+ walk_shadow_page_lockless_begin(vcpu);
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu(vcpu->arch.mmu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
+ walk_shadow_page_lockless_end(vcpu);
+
if (unlikely(leaf < 0)) {
*sptep = 0ull;
return reserved;
@@ -3545,20 +3844,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
for (level = root; level >= leaf; level--)
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the
- * reserved bit and EPT's invalid memtype/XWR checks to avoid
- * adding a Jcc in the loop.
- */
- reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) |
- __is_rsvd_bits_set(rsvd_check, sptes[level], level);
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
if (reserved) {
- pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
__func__, addr);
for (level = root; level >= leaf; level--)
- pr_err("------ spte 0x%llx level %d.\n",
- sptes[level], level);
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ get_rsvd_bits(rsvd_check, sptes[level], level));
}
return reserved;
@@ -3599,20 +3893,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
- u32 error_code, gfn_t gfn)
+ struct kvm_page_fault *fault)
{
- if (unlikely(error_code & PFERR_RSVD_MASK))
+ if (unlikely(fault->rsvd))
return false;
- if (!(error_code & PFERR_PRESENT_MASK) ||
- !(error_code & PFERR_WRITE_MASK))
+ if (!fault->present || !fault->write)
return false;
/*
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
return true;
return false;
@@ -3624,83 +3917,155 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
u64 spte;
walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
clear_sp_write_flooding_count(iterator.sptep);
- if (!is_shadow_present_pte(spte))
- break;
- }
walk_shadow_page_lockless_end(vcpu);
}
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+ /* make sure the token value is not 0 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
gfn_t gfn)
{
struct kvm_arch_async_pf arch;
- arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
- arch.direct_map = vcpu->arch.mmu->direct_map;
+ arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable)
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu->root_role.direct &&
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+ return;
+
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+}
+
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ struct kvm_memory_slot *slot = fault->slot;
bool async;
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
- *pfn = KVM_PFN_NOSLOT;
- *writable = false;
- return false;
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+ return RET_PF_RETRY;
+
+ if (!kvm_is_visible_memslot(slot)) {
+ /* Don't expose private memslots to L2. */
+ if (is_guest_mode(vcpu)) {
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+ return RET_PF_CONTINUE;
+ }
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+ !kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
}
async = false;
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
- write, writable, hva);
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->write, &fault->map_writable,
+ &fault->hva);
if (!async)
- return false; /* *pfn has correct page already */
+ return RET_PF_CONTINUE; /* *pfn has correct page already */
- if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
- if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- return true;
- } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
- return true;
+ return RET_PF_RETRY;
+ } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ return RET_PF_RETRY;
+ }
}
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
- write, writable, hva);
- return false;
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ fault->write, &fault->map_writable,
+ &fault->hva);
+ return RET_PF_CONTINUE;
}
-static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault, int max_level, bool is_tdp)
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int mmu_seq)
{
- bool write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ return true;
+
+ return fault->slot &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+}
+
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
- gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
- kvm_pfn_t pfn;
- hva_t hva;
int r;
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ fault->gfn = fault->addr >> PAGE_SHIFT;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
- if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
- r = fast_page_fault(vcpu, gpa, error_code);
- if (r != RET_PF_INVALID)
- return r;
- }
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
r = mmu_topup_memory_caches(vcpu, false);
if (r)
@@ -3709,50 +4074,50 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
- write, &map_writable))
- return RET_PF_RETRY;
+ r = kvm_faultin_pfn(vcpu, fault);
+ if (r != RET_PF_CONTINUE)
+ return r;
- if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
return r;
r = RET_PF_RETRY;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_lock(&vcpu->kvm->mmu_lock);
else
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
+
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
- r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
- pfn, prefault);
+ if (is_tdp_mmu_fault)
+ r = kvm_tdp_mmu_map(vcpu, fault);
else
- r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
- prefault, is_tdp);
+ r = __direct_map(vcpu, fault);
out_unlock:
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
- u32 error_code, bool prefault)
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
- return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
- PG_LEVEL_2M, false);
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -3788,118 +4153,139 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault)
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- int max_level;
-
- for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
- max_level > PG_LEVEL_4K;
- max_level--) {
- int page_num = KVM_PAGES_PER_HPAGE(max_level);
- gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
+ while (fault->max_level > PG_LEVEL_4K) {
+ int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
+ gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
+
+ --fault->max_level;
}
- return direct_page_fault(vcpu, gpa, error_code, prefault,
- max_level, true);
+ return direct_page_fault(vcpu, fault);
}
-static void nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->direct_map = true;
- context->nx = false;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ VALID_PAGE(root->hpa) &&
role.word == to_shadow_page(root->hpa)->role.word;
}
/*
- * Find out if a previously cached root matching the new pgd/role is available.
- * The current root is also inserted into the cache.
- * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
- * returned.
- * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
- * false is returned. This root should now be freed by the caller.
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
*/
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
uint i;
- struct kvm_mmu_root_info root;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
- root.pgd = mmu->root_pgd;
- root.hpa = mmu->root_hpa;
-
- if (is_root_usable(&root, new_pgd, new_role))
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
return true;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
- swap(root, mmu->prev_roots[i]);
-
- if (is_root_usable(&root, new_pgd, new_role))
- break;
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
}
- mmu->root_hpa = root.hpa;
- mmu->root_pgd = root.pgd;
-
- return i < KVM_MMU_NUM_PREV_ROOTS;
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
}
-static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role)
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
/*
- * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * For now, limit the caching to 64-bit hosts+VMs in order to avoid
* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
* later if necessary.
*/
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
- mmu->root_level >= PT64_ROOT_4LEVEL)
- return cached_root_available(vcpu, new_pgd, new_role);
+ if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
- return false;
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
}
-static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role,
- bool skip_tlb_flush, bool skip_mmu_sync)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
- if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->root_role;
+
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
+ /* kvm_mmu_ensure_valid_pgd will set up a new root. */
return;
}
/*
* It's possible that the cached previous root page is obsolete because
* of a change in the MMU generation number. However, changing the
- * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
- * free the root set here and allocate a new one.
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
- if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
+ if (force_flush_and_sync_on_reuse) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
/*
* The last MMIO access's GVA and GPA are cached in the VCPU. When
@@ -3915,14 +4301,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
if (!new_role.direct)
__clear_sp_write_flooding_count(
- to_shadow_page(vcpu->arch.mmu->root_hpa));
-}
-
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
- bool skip_mmu_sync)
-{
- __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
- skip_tlb_flush, skip_mmu_sync);
+ to_shadow_page(vcpu->arch.mmu->root.hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -3932,7 +4311,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- unsigned int access, int *nr_present)
+ unsigned int access)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3940,7 +4319,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return true;
}
- (*nr_present)++;
mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
@@ -3948,26 +4326,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu,
- unsigned level, unsigned gpte)
-{
- /*
- * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
- * If it is clear, there are no large pages at this level, so clear
- * PT_PAGE_SIZE_MASK in gpte if that is the case.
- */
- gpte &= level - mmu->last_nonleaf_level;
-
- /*
- * PG_LEVEL_4K always terminates. The RHS has bit 7 set
- * iff level <= PG_LEVEL_4K, which for our purpose means
- * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PG_LEVEL_4K - 1;
-
- return gpte & PT_PAGE_SIZE_MASK;
-}
-
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
@@ -3982,8 +4340,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
#undef PTTYPE
static void
-__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct rsvd_bits_validate *rsvd_check,
+__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
@@ -4072,35 +4429,56 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ */
+ return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+}
+
+static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
- context->root_level, context->nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu),
+ context->cpu_role.base.level, is_efer_nx(context),
+ guest_can_use_gbpages(vcpu),
+ is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly)
+ u64 pa_bits_rsvd, bool execonly, int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
u64 bad_mt_xwr;
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4116,10 +4494,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- vcpu->arch.reserved_gpa_bits, execonly);
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
}
static inline u64 reserved_hpa_bits(void)
@@ -4132,35 +4511,41 @@ static inline u64 reserved_hpa_bits(void)
* table in guest or amd nested guest, its mmu features completely
* follow the features in guest.
*/
-void
-reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- bool uses_nx = context->nx ||
- context->mmu_role.base.smep_andnot_wp;
+ /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
struct rsvd_bits_validate *shadow_zero_check;
int i;
- /*
- * Passing "true" to the last argument is okay; it adds a check
- * on bit 8 of the SPTEs which KVM doesn't use anyway.
- */
+ WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
+
shadow_zero_check = &context->shadow_zero_check;
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
- context->shadow_root_level, uses_nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu), true);
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level,
+ context->root_role.efer_nx,
+ guest_can_use_gbpages(vcpu), is_pse, is_amd);
if (!shadow_me_mask)
return;
- for (i = context->shadow_root_level; --i >= 0;) {
- shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
- shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ for (i = context->root_role.level; --i >= 0;) {
+ /*
+ * So far shadow_me_value is a constant during KVM's life
+ * time. Bits in shadow_me_value are allowed to be set.
+ * Bits in shadow_me_mask but not in shadow_me_value are
+ * not allowed to be set.
+ */
+ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
}
}
-EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
static inline bool boot_cpu_is_amd(void)
{
@@ -4173,8 +4558,7 @@ static inline bool boot_cpu_is_amd(void)
* possible, however, kvm currently does not do execution-protection.
*/
static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4182,19 +4566,19 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
shadow_zero_check = &context->shadow_zero_check;
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
- context->shadow_root_level, false,
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
- true, true);
+ false, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- reserved_hpa_bits(), false);
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
if (!shadow_me_mask)
return;
- for (i = context->shadow_root_level; --i >= 0;) {
+ for (i = context->root_role.level; --i >= 0;) {
shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
}
@@ -4205,11 +4589,11 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
* is the shadow page table for intel nested guest.
*/
static void
-reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- reserved_hpa_bits(), execonly);
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
}
#define BYTE_MASK(access) \
@@ -4222,8 +4606,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
(7 & (access) ? 128 : 0))
-static void update_permission_bitmask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
{
unsigned byte;
@@ -4231,9 +4614,10 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
const u8 w = BYTE_MASK(ACC_WRITE_MASK);
const u8 u = BYTE_MASK(ACC_USER_MASK);
- bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
- bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
- bool cr0_wp = is_write_protection(vcpu);
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
unsigned pfec = byte << 1;
@@ -4259,7 +4643,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
/* Not really needed: !nx will cause pte.nx to fault */
- if (!mmu->nx)
+ if (!efer_nx)
ff = 0;
/* Allow supervisor writes if !cr0.wp */
@@ -4278,11 +4662,11 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* - X86_CR4_SMAP is set in CR4
* - A user page is accessed
* - The access is not a fetch
- * - Page fault in kernel mode
- * - if CPL = 3 or X86_EFLAGS_AC is clear
+ * - The access is supervisor mode
+ * - If implicit supervisor access or X86_EFLAGS_AC is clear
*
- * Here, we cover the first three conditions.
- * The fourth is computed dynamically in permission_fault();
+ * Here, we cover the first four conditions.
+ * The fifth is computed dynamically in permission_fault();
* PFERR_RSVD_MASK bit will be set in PFEC if the access is
* *not* subject to SMAP restrictions.
*/
@@ -4318,24 +4702,17 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* away both AD and WD. For all reads or if the last condition holds, WD
* only will be masked away.
*/
-static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- bool ept)
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
{
unsigned bit;
bool wp;
- if (ept) {
- mmu->pkru_mask = 0;
- return;
- }
+ mmu->pkru_mask = 0;
- /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
- mmu->pkru_mask = 0;
+ if (!is_cr4_pke(mmu))
return;
- }
- wp = is_write_protection(vcpu);
+ wp = is_cr0_wp(mmu);
for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
unsigned pfec, pkey_bits;
@@ -4369,108 +4746,79 @@ static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
}
-static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
{
- unsigned root_level = mmu->root_level;
+ if (!is_cr0_pg(mmu))
+ return;
- mmu->last_nonleaf_level = root_level;
- if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
- mmu->last_nonleaf_level++;
+ reset_guest_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
}
-static void paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static void paging64_init_context(struct kvm_mmu *context)
{
- context->nx = is_nx(vcpu);
- context->root_level = level;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
- MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->shadow_root_level = level;
- context->direct_map = false;
-}
-
-static void paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- int root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
-
- paging64_init_context_common(vcpu, context, root_level);
}
-static void paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_mmu *context)
{
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->direct_map = false;
-}
-
-static void paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+static union kvm_cpu_role
+kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
{
- union kvm_mmu_extended_role ext = {0};
-
- ext.cr0_pg = !!is_paging(vcpu);
- ext.cr4_pae = !!is_pae(vcpu);
- ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
- ext.cr4_pse = !!is_pse(vcpu);
- ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
- ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
-
- ext.valid = 1;
-
- return ext;
-}
-
-static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
- bool base_only)
-{
- union kvm_mmu_role role = {0};
+ union kvm_cpu_role role = {0};
role.base.access = ACC_ALL;
- role.base.nxe = !!is_nx(vcpu);
- role.base.cr0_wp = is_write_protection(vcpu);
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
+ role.ext.valid = 1;
- if (base_only)
+ if (!____is_cr0_pg(regs)) {
+ role.base.direct = 1;
return role;
+ }
+
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+ role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+
+ if (____is_efer_lma(regs))
+ role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+ : PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ role.base.level = PT32E_ROOT_LEVEL;
+ else
+ role.base.level = PT32_ROOT_LEVEL;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ role.ext.cr4_smep = ____is_cr4_smep(regs);
+ role.ext.cr4_smap = ____is_cr4_smap(regs);
+ role.ext.cr4_pse = ____is_cr4_pse(regs);
+ /* PKEY and LA57 are active iff long mode is active. */
+ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ role.ext.efer_lma = ____is_efer_lma(regs);
return role;
}
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ /* tdp_root_level is architecture forced level, use it if nonzero */
+ if (tdp_root_level)
+ return tdp_root_level;
+
/* Use 5-level TDP if and only if it's useful/necessary. */
if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
return 4;
@@ -4478,238 +4826,204 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
return max_tdp_level;
}
-static union kvm_mmu_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+ union kvm_mmu_page_role role = {0};
- role.base.ad_disabled = (shadow_accessed_mask == 0);
- role.base.level = kvm_mmu_get_tdp_level(vcpu);
- role.base.direct = true;
- role.base.gpte_is_8_bytes = true;
+ role.access = ACC_ALL;
+ role.cr0_wp = true;
+ role.efer_nx = true;
+ role.smm = cpu_role.base.smm;
+ role.guest_mode = cpu_role.base.guest_mode;
+ role.ad_disabled = !kvm_ad_enabled();
+ role.level = kvm_mmu_get_tdp_level(vcpu);
+ role.direct = true;
+ role.has_4_byte_gpte = false;
return role;
}
-static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- union kvm_mmu_role new_role =
- kvm_calc_tdp_mmu_root_page_role(vcpu, false);
+ union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
- if (new_role.as_u64 == context->mmu_role.as_u64)
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
return;
- context->mmu_role.as_u64 = new_role.as_u64;
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
- context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
- if (!is_paging(vcpu)) {
- context->nx = false;
+ if (!is_cr0_pg(context))
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else if (is_cr4_pae(context))
context->gva_to_gpa = paging64_gva_to_gpa;
- } else if (is_pae(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging64_gva_to_gpa;
- } else {
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else
context->gva_to_gpa = paging32_gva_to_gpa;
- }
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
- reset_tdp_shadow_zero_bits_mask(vcpu, context);
+ reset_guest_paging_metadata(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
}
-static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
-{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
-
- role.base.smep_andnot_wp = role.ext.cr4_smep &&
- !is_write_protection(vcpu);
- role.base.smap_andnot_wp = role.ext.cr4_smap &&
- !is_write_protection(vcpu);
- role.base.gpte_is_8_bytes = !!is_pae(vcpu);
-
- return role;
-}
-
-static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ union kvm_cpu_role cpu_role,
+ union kvm_mmu_page_role root_role)
{
- union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, base_only);
-
- role.base.direct = !is_paging(vcpu);
-
- if (!is_long_mode(vcpu))
- role.base.level = PT32E_ROOT_LEVEL;
- else if (is_la57_mode(vcpu))
- role.base.level = PT64_ROOT_5LEVEL;
- else
- role.base.level = PT64_ROOT_4LEVEL;
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
- return role;
-}
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
-static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- u32 cr0, u32 cr4, u32 efer,
- union kvm_mmu_role new_role)
-{
- if (!(cr0 & X86_CR0_PG))
- nonpaging_init_context(vcpu, context);
- else if (efer & EFER_LMA)
- paging64_init_context(vcpu, context);
- else if (cr4 & X86_CR4_PAE)
- paging32E_init_context(vcpu, context);
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
else
- paging32_init_context(vcpu, context);
+ paging32_init_context(context);
- context->mmu_role.as_u64 = new_role.as_u64;
+ reset_guest_paging_metadata(vcpu, context);
reset_shadow_zero_bits_mask(vcpu, context);
}
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+ union kvm_mmu_page_role root_role;
- if (new_role.as_u64 != context->mmu_role.as_u64)
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
-}
+ root_role = cpu_role.base;
-static union kvm_mmu_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
-{
- union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, false);
+ /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
- role.base.direct = false;
- role.base.level = kvm_mmu_get_tdp_level(vcpu);
+ /*
+ * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts.
+ */
+ root_role.efer_nx = true;
- return role;
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
}
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
- gpa_t nested_cr3)
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
- union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4 & ~X86_CR4_PKE,
+ .efer = efer,
+ };
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+ union kvm_mmu_page_role root_role;
- context->shadow_root_level = new_role.base.level;
+ /* NPT requires CR0.PG=1. */
+ WARN_ON_ONCE(cpu_role.base.direct);
- __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+ root_role = cpu_role.base;
+ root_role.level = kvm_mmu_get_tdp_level(vcpu);
+ if (root_role.level == PT64_ROOT_5LEVEL &&
+ cpu_role.base.level == PT64_ROOT_4LEVEL)
+ root_role.passthrough = 1;
- if (new_role.as_u64 != context->mmu_role.as_u64)
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
-static union kvm_mmu_role
+static union kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly, u8 level)
{
- union kvm_mmu_role role = {0};
-
- /* SMM flag is inherited from root_mmu */
- role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
+ union kvm_cpu_role role = {0};
+ /*
+ * KVM does not support SMM transfer monitors, and consequently does not
+ * support the "entry to SMM" control either. role.base.smm is always 0.
+ */
+ WARN_ON_ONCE(is_smm(vcpu));
role.base.level = level;
- role.base.gpte_is_8_bytes = true;
+ role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
- /*
- * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
- * SMAP variation to denote shadow EPT entries.
- */
- role.base.cr0_wp = true;
- role.base.smap_andnot_wp = true;
-
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ role.ext.word = 0;
role.ext.execonly = execonly;
+ role.ext.valid = 1;
return role;
}
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp)
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
- union kvm_mmu_role new_role =
+ union kvm_cpu_role new_mode =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
+ if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ context->cpu_role.as_u64 = new_mode.as_u64;
+ context->root_role.word = new_mode.base.word;
- context->shadow_root_level = level;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
- context->nx = true;
- context->ept_ad = accessed_dirty;
- context->page_fault = ept_page_fault;
- context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
- context->root_level = level;
- context->direct_map = false;
- context->mmu_role.as_u64 = new_role.as_u64;
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
- update_permission_bitmask(vcpu, context, true);
- update_pkru_bitmask(vcpu, context, true);
- update_last_nonleaf_level(vcpu, context);
- reset_rsvds_bits_mask_ept(vcpu, context, execonly);
- reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+ kvm_mmu_new_pgd(vcpu, new_eptp);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- kvm_init_shadow_mmu(vcpu,
- kvm_read_cr0_bits(vcpu, X86_CR0_PG),
- kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
- vcpu->arch.efer);
+ kvm_init_shadow_mmu(vcpu, cpu_role);
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role new_mode)
{
- union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
- if (new_role.as_u64 == g_context->mmu_role.as_u64)
+ if (new_mode.as_u64 == g_context->cpu_role.as_u64)
return;
- g_context->mmu_role.as_u64 = new_role.as_u64;
+ g_context->cpu_role.as_u64 = new_mode.as_u64;
g_context->get_guest_pgd = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
@@ -4728,70 +5042,65 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
* nested page tables as the second level of translation. Basically
* the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
- if (!is_paging(vcpu)) {
- g_context->nx = false;
- g_context->root_level = 0;
- g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
- } else if (is_long_mode(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else if (is_pae(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else {
- g_context->nx = false;
- g_context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
- g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
- }
+ if (!is_paging(vcpu))
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_long_mode(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else if (is_pae(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
- update_permission_bitmask(vcpu, g_context, false);
- update_pkru_bitmask(vcpu, g_context, false);
- update_last_nonleaf_level(vcpu, g_context);
+ reset_guest_paging_metadata(vcpu, g_context);
}
-void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
- if (reset_roots) {
- uint i;
-
- vcpu->arch.mmu->root_hpa = INVALID_PAGE;
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
- }
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
if (mmu_is_nested(vcpu))
- init_kvm_nested_mmu(vcpu);
+ init_kvm_nested_mmu(vcpu, cpu_role);
else if (tdp_enabled)
- init_kvm_tdp_mmu(vcpu);
+ init_kvm_tdp_mmu(vcpu, cpu_role);
else
- init_kvm_softmmu(vcpu);
+ init_kvm_softmmu(vcpu, cpu_role);
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);
-static union kvm_mmu_page_role
-kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_role role;
-
- if (tdp_enabled)
- role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
- else
- role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
+ */
+ vcpu->arch.root_mmu.root_role.word = 0;
+ vcpu->arch.guest_mmu.root_role.word = 0;
+ vcpu->arch.nested_mmu.root_role.word = 0;
+ vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
- return role.base;
+ /*
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
+ */
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- kvm_init_mmu(vcpu, true);
+ kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -4799,28 +5108,90 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
if (r)
goto out;
- r = mmu_alloc_roots(vcpu);
- kvm_mmu_sync_roots(vcpu);
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->root_role.direct)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
if (r)
goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
kvm_mmu_load_pgd(vcpu);
- static_call(kvm_x86_tlb_flush_current)(vcpu);
+
+ /*
+ * Flush any TLB entries for the new root, the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
out:
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT has roots without shadow pages
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ */
+ sp = to_shadow_page(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static bool need_remote_flush(u64 old, u64 new)
{
@@ -4891,7 +5262,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -4915,7 +5286,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.gpte_is_8_bytes) {
+ if (sp->role.has_4_byte_gpte) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -4946,7 +5317,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush;
+ bool flush = false;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4955,13 +5326,11 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- remote_flush = local_flush = false;
-
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
/*
* No need to care whether allocation memory is successful
- * or not since pte prefetch is skiped if it does not have
+ * or not since pte prefetch is skipped if it does not have
* enough objects in the cache.
*/
mmu_topup_memory_caches(vcpu, true);
@@ -4971,9 +5340,8 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
@@ -4985,29 +5353,27 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!spte)
continue;
- local_flush = true;
while (npte--) {
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry && sp->role.level != PG_LEVEL_4K)
++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
- remote_flush = true;
+ flush = true;
++spte;
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
- kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
write_unlock(&vcpu->kvm->mmu_lock);
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_PF;
- bool direct = vcpu->arch.mmu->direct_map;
+ bool direct = vcpu->arch.mmu->root_role.direct;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@@ -5020,7 +5386,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
@@ -5036,7 +5402,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
* paging in both guests. If true, we simply unprotect the page
* and resume the guest.
*/
- if (vcpu->arch.mmu->direct_map &&
+ if (vcpu->arch.mmu->root_role.direct &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
return 1;
@@ -5072,14 +5438,14 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
}
if (!mmu->invlpg)
return;
if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
/*
* INVLPG is required to invalidate any global mappings for the VA,
@@ -5102,7 +5468,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5115,20 +5481,22 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
uint i;
if (pcid == kvm_get_active_pcid(vcpu)) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
tlb_flush = true;
}
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
tlb_flush = true;
}
}
if (tlb_flush)
- static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5139,10 +5507,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
*/
}
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
/*
@@ -5162,17 +5531,18 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool flush = false;
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
@@ -5180,7 +5550,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
flush |= fn(kvm, iterator.rmap, memslot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
+ if (flush && flush_on_yield) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
@@ -5190,38 +5560,35 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
}
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
- end_gfn - start_gfn + 1);
- flush = false;
- }
-
return flush;
}
static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool flush_on_yield)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ flush_on_yield, false);
}
static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, lock_flush_tlb);
+ PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
- free_page((unsigned long)mmu->lm_root);
+ free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
@@ -5229,20 +5596,25 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
struct page *page;
int i;
- mmu->root_hpa = INVALID_PAGE;
- mmu->root_pgd = 0;
- mmu->translate_gpa = translate_gpa;
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
* x86_64. Therefore we need to allocate the PDP table in the first
- * 4GB of memory, which happens to fit the DMA32 zone. Except for
- * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
- * skip allocating the PDP table.
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
@@ -5252,8 +5624,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
return -ENOMEM;
mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_value);
+
for (i = 0; i < 4; ++i)
- mmu->pae_root[i] = INVALID_PAGE;
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
return 0;
}
@@ -5273,8 +5659,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
-
ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
@@ -5294,6 +5678,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
+ bool unstable;
restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5325,17 +5710,22 @@ restart:
goto restart;
}
- if (__kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
- batch += nr_zapped;
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
goto restart;
- }
}
/*
- * Trigger a remote TLB flush before freeing the page tables to ensure
- * KVM is not in the middle of a lockless shadow page table walk, which
- * may reference the pages.
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+ * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
+ * running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
@@ -5366,6 +5756,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
/*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
+ */
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+
+ /*
* Notify all vcpus to reload its shadow page table and flush TLB.
* Then all vcpus will switch to new shadow page table with the new
* mmu_valid_gen.
@@ -5373,14 +5772,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Note: we need to do this under the protection of mmu_lock,
* otherwise, vcpu would purge shadow page but miss tlb flush.
*/
- kvm_reload_remote_mmus(kvm);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
kvm_zap_obsolete_pages(kvm);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
+
+ /*
+ * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
+ * returning to the caller, e.g. if the zap is in response to a memslot
+ * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+ * associated with the deleted memslot once the update completes, and
+ * Deferring the zap until the final reference to the root is put would
+ * lead to use-after-free.
+ */
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5395,15 +5802,24 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
kvm_mmu_zap_all_fast(kvm);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ int r;
- kvm_mmu_init_tdp_mmu(kvm);
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
+ r = kvm_mmu_init_tdp_mmu(kvm);
+ if (r < 0)
+ return r;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
+ return 0;
}
void kvm_mmu_uninit_vm(struct kvm *kvm)
@@ -5415,78 +5831,168 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_mmu_uninit_tdp_mmu(kvm);
}
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
+ const struct kvm_memory_slot *memslot;
struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
+ struct kvm_memslot_iter iter;
+ bool flush = false;
+ gfn_t start, end;
int i;
- bool flush;
- write_lock(&kvm->mmu_lock);
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- gfn_t start, end;
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
start = max(gfn_start, memslot->base_gfn);
end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
+ if (WARN_ON_ONCE(start >= end))
continue;
- slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
+ return flush;
+}
+
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
+ * (not including it)
+ */
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ bool flush;
+ int i;
+
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
+
+ write_lock(&kvm->mmu_lock);
+
+ kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+
+ flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
+
if (is_tdp_mmu_enabled(kvm)) {
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (flush)
- kvm_flush_remote_tlbs(kvm);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
+ gfn_end, true, flush);
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
+
+ kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+
write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return rmap_write_protect(rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL,
+ false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
/*
- * We can flush all the TLBs out of the mmu lock without TLB
- * corruption since we just change the spte from writable to
- * readonly so that we only need to care the case of changing
- * spte from present to present (changing the spte from present
- * to nonpresent will flush all the TLBs immediately), in other
- * words, the only case we care is mmu_spte_update() where we
- * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
- * instead of PT_WRITABLE_MASK, that means it does not depend
- * on PT_WRITABLE_MASK anymore.
+ * Flush TLBs if any SPTEs had to be write-protected to ensure that
+ * guest writes are reflected in the dirty bitmap before the memslot
+ * update completes, i.e. before enabling dirty logging is visible to
+ * userspace.
+ *
+ * Perform the TLB flush outside the mmu_lock to reduce the amount of
+ * time the lock is held. However, this does mean that another CPU can
+ * now grab mmu_lock and encounter a write-protected SPTE while CPUs
+ * still have a writable mapping for the associated GFN in their TLB.
+ *
+ * This is safe but requires KVM to be careful when making decisions
+ * based on the write-protection status of an SPTE. Specifically, KVM
+ * also write-protects SPTEs to monitor changes to guest page tables
+ * during shadow paging, and must guarantee no CPUs can write to those
+ * page before the lock is dropped. As mentioned in the previous
+ * paragraph, a write-protected SPTE is no guarantee that CPU cannot
+ * perform writes. So to determine if a TLB flush is truly required, KVM
+ * will clear a separate software-only bit (MMU-writable) and skip the
+ * flush if-and-only-if this bit was already clear.
+ *
+ * See is_writable_pte() for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+ target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same resons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5509,7 +6015,7 @@ restart:
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
pfn, PG_LEVEL_NUM)) {
- pte_list_remove(rmap_head, sptep);
+ pte_list_remove(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
@@ -5525,25 +6031,33 @@ restart:
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *slot)
{
- /* FIXME: const-ify all uses of struct kvm_memory_slot. */
- struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
-
- write_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ /*
+ * Zap only 4k SPTEs since the legacy MMU only supports dirty
+ * logging at a 4k granularity and never creates collapsible
+ * 2m SPTEs during dirty logging.
+ */
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
- if (is_tdp_mmu_enabled(kvm))
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
/*
* All current use cases for flushing the TLBs for a specific memslot
- * are related to dirty logging, and do the TLB flush out of mmu_lock.
+ * related to dirty logging, and many do the TLB flush out of mmu_lock.
* The interaction between the various operations on memslot must be
* serialized by slots_locks to ensure the TLB flush from one operation
* is observed by any other operation on the same memslot.
@@ -5554,15 +6068,25 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (is_tdp_mmu_enabled(kvm))
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+ * support dirty logging at a 4k granularity.
+ */
+ flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5701,25 +6225,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
-
- /*
- * Set a reserved PA bit in MMIO SPTEs to generate page faults with
- * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
- * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
- * 52-bit physical addresses then there are no reserved PA bits in the
- * PTEs and so the reserved PA approach must be disabled.
- */
- if (shadow_phys_bits < 52)
- mask = BIT_ULL(51) | PT_PRESENT_MASK;
- else
- mask = 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
@@ -5766,12 +6271,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
return 0;
}
-int kvm_mmu_module_init(void)
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ */
+void kvm_mmu_x86_module_init(void)
{
- int ret = -ENOMEM;
-
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
/*
* MMU roles use union aliasing which is, generally speaking, an
@@ -5781,12 +6298,10 @@ int kvm_mmu_module_init(void)
*/
BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
- BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+ BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
kvm_mmu_reset_all_pte_masks();
- kvm_set_mmio_spte_mask();
-
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
@@ -5813,30 +6328,6 @@ out:
return ret;
}
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
-{
- unsigned long nr_mmu_pages;
- unsigned long nr_pages = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
-
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
- }
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
@@ -5845,26 +6336,54 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
- mmu_audit_disable();
}
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
{
- unsigned int old_val;
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
+{
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
int err;
- old_val = nx_huge_pages_recovery_ratio;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
+
err = param_set_uint(val, kp);
if (err)
return err;
- if (READ_ONCE(nx_huge_pages) &&
- !old_val && nx_huge_pages_recovery_ratio) {
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
+
+ if (is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
mutex_lock(&kvm_lock);
@@ -5880,6 +6399,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel
static void kvm_recover_nx_lpages(struct kvm *kvm)
{
+ unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -5890,8 +6410,15 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
rcu_idx = srcu_read_lock(&kvm->srcu);
write_lock(&kvm->mmu_lock);
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
break;
@@ -5906,7 +6433,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
if (is_tdp_mmu_page(sp)) {
- flush = kvm_tdp_mmu_zap_sp(kvm, sp);
+ flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
} else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
@@ -5914,21 +6441,31 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
cond_resched_rwlock_write(&kvm->mmu_lock);
flush = false;
+
+ rcu_read_lock();
}
}
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
- ? start_time + 60 * HZ - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ bool enabled;
+ uint period;
+
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
deleted file mode 100644
index ced15fd58fde..000000000000
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mmu_audit.c:
- *
- * Audit code for KVM MMU
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- * Marcelo Tosatti <mtosatti@redhat.com>
- * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
- */
-
-#include <linux/ratelimit.h>
-
-static char const *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
-#define audit_printk(kvm, fmt, args...) \
- printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[kvm->arch.audit_point], ##args)
-
-typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-
-static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- inspect_spte_fn fn, int level)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 *ent = sp->spt;
-
- fn(vcpu, ent + i, level);
-
- if (is_shadow_present_pte(ent[i]) &&
- !is_last_spte(ent[i], level)) {
- struct kvm_mmu_page *child;
-
- child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(vcpu, child, fn, level - 1);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- hpa_t root = vcpu->arch.mmu->root_hpa;
-
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
- return;
- }
-
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu->pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
- __mmu_spte_walk(vcpu, sp, fn, 2);
- }
- }
-
- return;
-}
-
-typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
- fn(kvm, sp);
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- kvm_pfn_t pfn;
- hpa_t hpa;
-
- sp = sptep_to_sp(sptep);
-
- if (sp->unsync) {
- if (level != PG_LEVEL_4K) {
- audit_printk(vcpu->kvm, "unsync sp: %p "
- "level = %d\n", sp, level);
- return;
- }
- }
-
- if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
-
- if (is_error_pfn(pfn))
- return;
-
- hpa = pfn << PAGE_SHIFT;
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
- "ent %llxn", vcpu->arch.mmu->root_level, pfn,
- hpa, *sptep);
-}
-
-static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- struct kvm_rmap_head *rmap_head;
- struct kvm_mmu_page *rev_sp;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- gfn_t gfn;
-
- rev_sp = sptep_to_sp(sptep);
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- if (!slot) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
- audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
- (long int)(sptep - rev_sp->spt), rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!rmap_head->val) {
- if (!__ratelimit(&ratelimit_state))
- return;
- audit_printk(kvm, "no rmap for writable spte %llx\n",
- *sptep);
- dump_stack();
- }
-}
-
-static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
- inspect_spte_has_rmap(vcpu->kvm, sptep);
-}
-
-static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- struct kvm_mmu_page *sp = sptep_to_sp(sptep);
-
- if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
- "root.\n", sp);
-}
-
-static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int i;
-
- if (sp->role.level != PG_LEVEL_4K)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- inspect_spte_has_rmap(kvm, sp->spt + i);
- }
-}
-
-static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct kvm_rmap_head *rmap_head;
- u64 *sptep;
- struct rmap_iterator iter;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
-
- if (sp->role.direct || sp->unsync || sp->role.invalid)
- return;
-
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (is_writable_pte(*sptep))
- audit_printk(kvm, "shadow page has writable "
- "mappings: gfn %llx role %x\n",
- sp->gfn, sp->role.word);
- }
-}
-
-static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- check_mappings_rmap(kvm, sp);
- audit_write_protection(kvm, sp);
-}
-
-static void audit_all_active_sps(struct kvm *kvm)
-{
- walk_all_active_sps(kvm, audit_sp);
-}
-
-static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
-{
- audit_sptes_have_rmaps(vcpu, sptep, level);
- audit_mappings(vcpu, sptep, level);
- audit_spte_after_sync(vcpu, sptep, level);
-}
-
-static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, audit_spte);
-}
-
-static bool mmu_audit;
-static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
-
-static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
-
- if (!__ratelimit(&ratelimit_state))
- return;
-
- vcpu->kvm->arch.audit_point = point;
- audit_all_active_sps(vcpu->kvm);
- audit_vcpu_spte(vcpu);
-}
-
-static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
-{
- if (static_branch_unlikely((&mmu_audit_key)))
- __kvm_mmu_audit(vcpu, point);
-}
-
-static void mmu_audit_enable(void)
-{
- if (mmu_audit)
- return;
-
- static_branch_inc(&mmu_audit_key);
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- static_branch_dec(&mmu_audit_key);
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = kstrtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static const struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1f6f98c76bdf..bd2a26897b97 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -20,14 +20,29 @@ extern bool dbg;
#define MMU_WARN_ON(x) do { } while (0)
#endif
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
+typedef u64 __rcu *tdp_ptep_t;
+
struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
struct list_head link;
struct hlist_node hash_link;
- struct list_head lpage_disallowed_link;
+ bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool mmio_cached;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */
/*
@@ -40,11 +55,25 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- int root_count; /* Currently serving as active root */
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
unsigned int unsync_children;
- struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
- DECLARE_BITMAP(unsync_child_bitmap, 512);
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ union {
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+ struct {
+ struct work_struct tdp_mmu_async_work;
+ void *tdp_mmu_async_data;
+ };
+ };
+ struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -57,9 +86,7 @@ struct kvm_mmu_page {
atomic_t write_flooding_count;
#ifdef CONFIG_X86_64
- bool tdp_mmu_page;
-
- /* Used for freeing the page asyncronously if it is a TDP MMU page. */
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
#endif
};
@@ -78,12 +105,17 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
- return sp->role.smm ? 1 : 0;
+ return kvm_mmu_role_as_id(sp->role);
}
-static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@@ -91,70 +123,167 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
* on write protection to record dirty pages, which bypasses PML, since
* writes now result in a vmexit. Note, the check on CPU dirty logging
* being enabled is mandatory as the bits used to denote WP-only SPTEs
- * are reserved for NPT w/ PAE (32-bit KVM).
+ * are reserved for PAE paging (32-bit KVM).
*/
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu &&
- kvm_x86_ops.cpu_dirty_log_size;
+ return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
-bool is_nx_huge_page_enabled(void);
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync);
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch);
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn);
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
{
- BUG_ON(!sp->root_count);
- lockdep_assert_held(&kvm->mmu_lock);
-
- ++sp->root_count;
+ return READ_ONCE(nx_huge_pages);
}
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- lockdep_assert_held(&kvm->mmu_lock);
- --sp->root_count;
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u32 error_code;
+ const bool prefetch;
- return !sp->root_count;
-}
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_faultin_pfn. */
+ kvm_pfn_t pfn;
+ hva_t hva;
+ bool map_writable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
/*
- * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
+ * and of course kvm_mmu_do_page_fault().
*
+ * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
+ *
+ * Note, all values must be greater than or equal to zero so as not to encroach
+ * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
+ * will allow for efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
*/
enum {
- RET_PF_RETRY = 0,
+ RET_PF_CONTINUE = 0,
+ RET_PF_RETRY,
RET_PF_EMULATE,
RET_PF_INVALID,
RET_PF_FIXED,
RET_PF_SPURIOUS,
};
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 err, bool prefetch)
+{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ };
+ int r;
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level);
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp);
+ /*
+ * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
+ * guest perspective and have already been counted at the time of the
+ * original fault.
+ */
+ if (!prefetch)
+ vcpu->stat.pf_taken++;
+
+ if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+ r = kvm_tdp_page_fault(vcpu, &fault);
+ else
+ r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+ /*
+ * Similar to above, prefetch faults aren't truly spurious, and the
+ * async #PF path doesn't do emulation. Do count faults that are fixed
+ * by the async #PF handler though, otherwise they'll never be counted.
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (prefetch)
+ ;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+ return r;
+}
-bool is_nx_huge_page_enabled(void);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index e798489b56b5..ae86820cef69 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -35,12 +35,12 @@
" %snxe %sad root %u %s%c", \
__entry->mmu_valid_gen, \
__entry->gfn, role.level, \
- role.gpte_is_8_bytes ? 8 : 4, \
+ role.has_4_byte_gpte ? 4 : 8, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
role.invalid ? " invalid" : "", \
- role.nxe ? "" : "!", \
+ role.efer_nx ? "" : "!", \
role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
@@ -54,6 +54,13 @@
{ PFERR_RSVD_MASK, "RSVD" }, \
{ PFERR_FETCH_MASK, "F" }
+TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
/*
* A pagetable walk has started
*/
@@ -246,9 +253,9 @@ TRACE_EVENT(
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, int ret),
- TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
TP_STRUCT__entry(
__field(int, vcpu_id)
@@ -262,8 +269,8 @@ TRACE_EVENT(
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->cr2_or_gpa = cr2_or_gpa;
- __entry->error_code = error_code;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
__entry->new_spte = *sptep;
@@ -361,8 +368,8 @@ TRACE_EVENT(
TRACE_EVENT(
kvm_mmu_spte_requested,
- TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
- TP_ARGS(addr, level, pfn),
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
TP_STRUCT__entry(
__field(u64, gfn)
@@ -371,9 +378,9 @@ TRACE_EVENT(
),
TP_fast_assign(
- __entry->gfn = addr >> PAGE_SHIFT;
- __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
- __entry->level = level;
+ __entry->gfn = fault->gfn;
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
),
TP_printk("gfn %llx pfn %llx level %d",
@@ -410,6 +417,29 @@ TRACE_EVENT(
)
);
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 34bb0ec69bd8..2e09d1b6249f 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -16,8 +16,15 @@
#include <asm/kvm_page_track.h>
+#include "mmu.h"
#include "mmu_internal.h"
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
+ !tdp_enabled || kvm_shadow_root_allocated(kvm);
+}
+
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
int i;
@@ -28,15 +35,20 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
}
}
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages)
{
- int i;
+ int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ if (i == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm))
+ continue;
+
slot->arch.gfn_track[i] =
- kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL_ACCOUNT);
+ __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
@@ -56,6 +68,22 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
return true;
}
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ unsigned short *gfn_track;
+
+ if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
+ return 0;
+
+ gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
+ GFP_KERNEL_ACCOUNT);
+ if (gfn_track == NULL)
+ return -ENOMEM;
+
+ slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
+ return 0;
+}
+
static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode, short count)
{
@@ -91,6 +119,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, 1);
/*
@@ -100,7 +132,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (mode == KVM_PAGE_TRACK_WRITE)
- if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
@@ -125,6 +157,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, -1);
/*
@@ -138,19 +174,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode)
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode)
{
- struct kvm_memory_slot *slot;
int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot)
return false;
+ if (mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm))
+ return false;
+
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
}
@@ -163,13 +202,13 @@ void kvm_page_track_cleanup(struct kvm *kvm)
cleanup_srcu_struct(&head->track_srcu);
}
-void kvm_page_track_init(struct kvm *kvm)
+int kvm_page_track_init(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
head = &kvm->arch.track_notifier_head;
- init_srcu_struct(&head->track_srcu);
INIT_HLIST_HEAD(&head->track_notifier_list);
+ return init_srcu_struct(&head->track_srcu);
}
/*
diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h
new file mode 100644
index 000000000000..de8ab323bb70
--- /dev/null
+++ b/arch/x86/kvm/mmu/paging.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Shadow paging constants/helpers that don't need to be #undef'd. */
+#ifndef __KVM_X86_PAGING_H
+#define __KVM_X86_PAGING_H
+
+#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#endif /* __KVM_X86_PAGING_H */
+
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 55d7b473ac44..db80f7ccaa4e 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -24,7 +24,7 @@
#define pt_element_t u64
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -34,9 +34,8 @@
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgq"
#else
- #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
@@ -52,20 +51,22 @@
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
#define FNAME(name) ept_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
- #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
- #define CMPXCHG cmpxchg64
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
+ #ifdef CONFIG_X86_64
+ #define CMPXCHG "cmpxchgq"
+ #endif
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
@@ -90,8 +91,8 @@ struct guest_walker {
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
@@ -143,49 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}
-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t __user *ptep_user, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- int npages;
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
- if (likely(npages == 1)) {
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
-
- kvm_release_page_dirty(page);
- } else {
- struct vm_area_struct *vma;
- unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
- unsigned long pfn;
- unsigned long paddr;
-
- mmap_read_lock(current->mm);
- vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
- if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- paddr = pfn << PAGE_SHIFT;
- table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
- if (!table) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- memunmap(table);
- mmap_read_unlock(current->mm);
- }
-
- return (ret != orig_pte);
-}
-
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
@@ -193,7 +151,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
- /* if accessed bit is not supported prefetch non accessed gpte */
+ /* Prefetch only accessed entries (unless A/D bits are disabled). */
if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
!(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
@@ -284,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (unlikely(!walker->pte_writable[level - 1]))
continue;
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
if (ret)
return ret;
@@ -305,12 +263,41 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
return pkeys;
}
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+ * all level or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
/*
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gpa_t addr, u32 access)
+ gpa_t addr, u64 access)
{
int ret;
pt_element_t pte;
@@ -318,7 +305,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gfn_t table_gfn;
u64 pt_access, pte_access;
unsigned index, accessed_dirty, pte_pkey;
- unsigned nested_access;
+ u64 nested_access;
gpa_t pte_gpa;
bool have_ad;
int offset;
@@ -332,7 +319,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
- walker->level = mmu->root_level;
+ walker->level = mmu->cpu_role.base.level;
pte = mmu->get_guest_pgd(vcpu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
@@ -374,9 +361,8 @@ retry_walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- nested_access,
- &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
/*
* FIXME: This can happen if emulation (for of an INS/OUTS
@@ -418,13 +404,15 @@ retry_walk:
}
walker->ptes[walker->level - 1] = pte;
- } while (!is_last_gpte(mmu, walker->level, pte));
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
@@ -436,7 +424,7 @@ retry_walk:
if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == UNMAPPED_GVA)
return 0;
@@ -463,13 +451,13 @@ retry_walk:
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access, walker->pt_access);
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
return 1;
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && (mmu->nx ||
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -491,44 +479,43 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= 0x180;
+ vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
if (write_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
- vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
}
#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
}
-#if PTTYPE != PTTYPE_EPT
-static int FNAME(walk_addr_nested)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
-}
-#endif
-
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
+ struct kvm_memory_slot *slot;
unsigned pte_access;
gfn_t gfn;
kvm_pfn_t pfn;
@@ -541,30 +528,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
- if (is_error_pfn(pfn))
+ if (!slot)
return false;
- /*
- * we call mmu_set_spte() with host_writable = true because
- * pte_prefetch_gfn_to_pfn always gets a writable pfn.
- */
- mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
- true, true);
+ pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
+ if (is_error_pfn(pfn))
+ return false;
+ mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
kvm_release_pfn_clean(pfn);
return true;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte = *(const pt_element_t *)pte;
-
- FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
-}
-
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
struct guest_walker *gw, int level)
{
@@ -631,24 +609,19 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
- struct guest_walker *gw, u32 error_code,
- int max_level, kvm_pfn_t pfn, bool map_writable,
- bool prefault)
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
- unsigned direct_access, access = gw->pt_access;
- int top_level, level, req_level, ret;
- gfn_t base_gfn = gw->gfn;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+ WARN_ON_ONCE(gw->gfn != base_gfn);
direct_access = gw->pte_access;
- top_level = vcpu->arch.mmu->root_level;
+ top_level = vcpu->arch.mmu->cpu_role.base.level;
if (top_level == PT32E_ROOT_LEVEL)
top_level = PT32_ROOT_LEVEL;
/*
@@ -660,10 +633,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, addr);
+ for (shadow_walk_init(&it, vcpu, fault->addr);
shadow_walk_okay(&it) && it.level > gw->level;
shadow_walk_next(&it)) {
gfn_t table_gfn;
@@ -674,8 +647,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access);
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
+ it.level-1, false, access);
+ /*
+ * We must synchronize the pagetable before linking it
+ * because the guest doesn't need to flush tlb when
+ * the gpte is changed from non-present to present.
+ * Otherwise, the guest may use the wrong mapping.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+ * synchronized it transiently via kvm_sync_page().
+ *
+ * For higher level pagetable, we synchronize it via
+ * the slower mmu_sync_children(). If it needs to
+ * break, some progress has been made; return
+ * RET_PF_RETRY and retry on the next #PF.
+ * KVM_REQ_MMU_SYNC is not necessary but it
+ * expedites the process.
+ */
+ if (sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
}
/*
@@ -689,10 +682,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
@@ -701,12 +693,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
validate_direct_spte(vcpu, it.sptep, direct_access);
@@ -714,21 +705,24 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
drop_large_spte(vcpu, it.sptep);
if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
- if (huge_page_disallowed && req_level >= it.level)
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
- ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, base_gfn, pfn, prefault, map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- ++vcpu->stat.pf_fixed;
return ret;
out_gpte_changed:
@@ -762,7 +756,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
bool self_changed = false;
if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_write_protection(vcpu) && !user_fault)))
+ (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
return false;
for (level = walker->level; level <= walker->max_level; level++) {
@@ -789,45 +783,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
- bool prefault)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- kvm_pfn_t pfn;
- hva_t hva;
unsigned long mmu_seq;
- bool map_writable, is_self_change_mapping;
- int max_level;
+ bool is_self_change_mapping;
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
+ WARN_ON_ONCE(fault->is_tdp);
/*
+ * Look up the guest pte for the faulting address.
* If PFEC.RSVD is set, this is a shadow page fault.
* The bit needs to be cleared before walking guest page tables.
*/
- error_code &= ~PFERR_RSVD_MASK;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault)
+ if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
return RET_PF_RETRY;
}
- if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
- shadow_page_table_clear_flood(vcpu, addr);
+ fault->gfn = walker.gfn;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
return RET_PF_EMULATE;
}
@@ -838,30 +827,30 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+ &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
if (is_self_change_mapping)
- max_level = PG_LEVEL_4K;
+ fault->max_level = PG_LEVEL_4K;
else
- max_level = walker.level;
+ fault->max_level = walker.level;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
- write_fault, &map_writable))
- return RET_PF_RETRY;
+ r = kvm_faultin_pfn(vcpu, fault);
+ if (r != RET_PF_CONTINUE)
+ return r;
- if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
+ if (r != RET_PF_CONTINUE)
return r;
/*
* Do not change pte_access if the pfn is a mmio page, otherwise
* we will cache the incorrect access into mmio spte.
*/
- if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_write_protection(vcpu) && !user_fault &&
- !is_noslot_pfn(pfn)) {
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -871,26 +860,24 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
- kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
- map_writable, prefault);
- kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+ r = FNAME(fetch)(vcpu, fault, &walker);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -956,87 +943,89 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
sizeof(pt_element_t)))
break;
- FNAME(update_pte)(vcpu, sp, sptep, &gpte);
+ FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
}
- if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ if (!sp->unsync_children)
break;
}
write_unlock(&vcpu->kvm->mmu_lock);
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, addr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= addr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-#if PTTYPE != PTTYPE_EPT
-/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
#ifndef CONFIG_X86_64
/* A 64-bit GVA should be impossible on 32-bit KVM. */
- WARN_ON_ONCE(vaddr >> 32);
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif
- r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
return gpa;
}
-#endif
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*
- * Note:
- * We should flush all tlbs if spte is dropped even though guest is
- * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
- * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
- * used by guest then tlbs are not flushed, so guest is allowed to access the
- * freed pages.
- * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ * Returns
+ * < 0: the sp should be zapped
+ * 0: the sp is synced and no tlb flushing is required
+ * > 0: the sp is synced and tlb flushing is required
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- int i, nr_present = 0;
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+ int i;
bool host_writable;
gpa_t first_pte_gpa;
- int set_spte_ret = 0;
+ bool flush = false;
- /* direct kvm_mmu_page can not be unsync. */
- BUG_ON(sp->role.direct);
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return -1;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
@@ -1049,16 +1038,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
- return 0;
+ return -1;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- /*
- * Update spte before increasing tlbs_dirty to make
- * sure no tlb flush is lost after spte is zapped; see
- * the comments in kvm_flush_remote_tlbs().
- */
- smp_wmb();
- vcpu->kvm->tlbs_dirty++;
+ flush = true;
continue;
}
@@ -1067,35 +1050,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
- &nr_present))
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
continue;
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
- /*
- * The same as above where we are doing
- * prefetch_invalid_gpte().
- */
- smp_wmb();
- vcpu->kvm->tlbs_dirty++;
+ flush = true;
continue;
}
- nr_present++;
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
-
- set_spte_ret |= set_spte(vcpu, &sp->spt[i],
- pte_access, PG_LEVEL_4K,
- gfn, spte_to_pfn(sp->spt[i]),
- true, false, host_writable);
+ flush |= mmu_spte_update(sptep, spte);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- return nr_present;
+ return flush;
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index ef55f0bc4ccf..b5960bbde7f7 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,15 +16,24 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/memtype.h>
+#include <asm/vmx.h>
+bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_me_value;
u64 __read_mostly shadow_me_mask;
u64 __read_mostly shadow_acc_track_mask;
@@ -38,7 +47,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +56,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
- u64 mask = generation_mmio_spte_mask(gen);
+ u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
+ WARN_ON_ONCE(!shadow_mmio_value);
+
access &= shadow_mmio_access_mask;
- mask |= shadow_mmio_value | access;
- mask |= gpa | shadow_nonpresent_or_rsvd_mask;
- mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ spte |= shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
- return mask;
+ return spte;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -81,18 +91,48 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte)
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
{
- u64 spte = 0;
- int ret = 0;
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
- if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
- else if (kvm_vcpu_ad_need_write_protect(vcpu))
- spte |= SPTE_AD_WRPROT_ONLY_MASK;
+ if (is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if (!(spte & shadow_accessed_mask) ||
+ (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+ return true;
+ }
+
+ return false;
+}
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte)
+{
+ int level = sp->role.level;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+ bool wrprot = false;
+
+ if (sp->role.ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
+ else if (kvm_mmu_page_ad_need_write_protect(sp))
+ spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -101,7 +141,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* read access. See FNAME(gpte_access) in paging_tmpl.h.
*/
spte |= shadow_present_mask;
- if (!speculative)
+ if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
@@ -124,56 +164,131 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
kvm_is_mmio_pfn(pfn));
if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
+ spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
- if (!kvm_is_mmio_pfn(pfn))
- spte |= shadow_me_mask;
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ spte |= shadow_me_value;
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
- * is responsibility of mmu_get_page / kvm_sync_page.
+ * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writable_pte(old_spte))
+ if (is_writable_pte(old_spte))
goto out;
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ */
+ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
- ret |= SET_SPTE_WRITE_PROTECTED_PT;
+ wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
}
if (pte_access & ACC_WRITE_MASK)
spte |= spte_shadow_dirty_mask(spte);
- if (speculative)
+out:
+ if (prefetch)
spte = mark_spte_for_access_track(spte);
-out:
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
*new_spte = spte;
- return ret;
+ return wrprot;
+}
+
+static u64 make_spte_executable(u64 spte)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ spte &= ~shadow_nx_mask;
+ spte |= shadow_x_mask;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
+{
+ u64 child_spte;
+ int child_level;
+
+ if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+ return 0;
+
+ if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+ return 0;
+
+ child_spte = huge_spte;
+ child_level = huge_level - 1;
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+ if (child_level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page, mark the page executable as the
+ * NX hugepage mitigation no longer applies.
+ */
+ if (is_nx_huge_page_enabled())
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
}
+
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
- u64 spte;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
- spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_value;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -188,32 +303,14 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_host_writable_mask;
+ new_spte &= ~shadow_mmu_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
return new_spte;
}
-static u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
-
u64 mark_spte_for_access_track(u64 spte)
{
if (spte_ad_enabled(spte))
@@ -222,14 +319,7 @@ u64 mark_spte_for_access_track(u64 spte)
if (is_access_track_spte(spte))
return spte;
- /*
- * Making an Access Tracking PTE will result in removal of write access
- * from the PTE. So, verify that we will be able to restore the write
- * access in the fast page fault path later on.
- */
- WARN_ONCE((spte & PT_WRITABLE_MASK) &&
- !spte_can_locklessly_be_made_writable(spte),
- "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+ check_spte_writable_invariants(spte);
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
@@ -242,53 +332,80 @@ u64 mark_spte_for_access_track(u64 spte)
return spte;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
- shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a removed SPTE
+ * must not get a false positive. Removed SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ if (!mmio_value)
+ enable_mmio_caching = false;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- * - Setting either @accessed_mask or @dirty_mask requires setting both
- * - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
{
- BUG_ON(!dirty_mask != !accessed_mask);
- BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
- shadow_present_mask = p_mask;
- shadow_acc_track_mask = acc_track_mask;
+ /* shadow_me_value must be a subset of shadow_me_mask */
+ if (WARN_ON(me_value & ~me_mask))
+ me_value = me_mask = 0;
+
+ shadow_me_value = me_value;
shadow_me_mask = me_mask;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
+
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+{
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+ shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
-
- shadow_user_mask = 0;
- shadow_accessed_mask = 0;
- shadow_dirty_mask = 0;
- shadow_nx_mask = 0;
- shadow_x_mask = 0;
- shadow_present_mask = 0;
- shadow_acc_track_mask = 0;
+ u64 mask;
shadow_phys_bits = kvm_get_shadow_phys_bits();
@@ -315,4 +432,31 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = 0;
+ shadow_me_value = 0;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6de3950fd704..0127bb6e3c7d 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -5,30 +5,41 @@
#include "mmu_internal.h"
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+extern bool __read_mostly enable_mmio_caching;
/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
*/
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
+
+/*
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * is must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
+ */
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
-#define PT64_LVL_ADDR_MASK(level) \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
- (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
@@ -51,16 +62,51 @@
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+/*
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -71,39 +117,45 @@
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_END 10
-#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -121,28 +173,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
- PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
-/*
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE 0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_removed_spte(u64 spte)
{
@@ -159,15 +204,26 @@ static inline bool is_removed_spte(u64 spte)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ likely(enable_mmio_caching);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
+}
+
/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
+ * Returns true if A/D bits are supported in hardware and are enabled by KVM.
+ * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
+ * scenario where KVM is using A/D bits for L1, but not L2.
*/
-extern u8 __read_mostly shadow_phys_bits;
-
-static inline bool is_mmio_spte(u64 spte)
+static inline bool kvm_ad_enabled(void)
{
- return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+ return !!shadow_accessed_mask;
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +233,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
@@ -204,11 +265,6 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline bool is_shadow_present_pte(u64 pte)
-{
- return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -244,10 +300,113 @@ static inline bool is_dirty_spte(u64 spte)
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
+/*
+ * An shadow-present leaf SPTE may be non-writable for 3 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ * during the VM-iotcls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * To keep track of why a given SPTE is write-protected, KVM uses 2
+ * software-only bits in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned 3 cases:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs, it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
{
- return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
- (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+}
+
+static inline bool is_mmu_writable_spte(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
}
static inline u64 get_mmio_spte_generation(u64 spte)
@@ -259,18 +418,32 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
+bool spte_has_volatile_bits(u64 spte);
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte);
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index b3ed302c1a35..ee4802d7b36c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
+ iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
@@ -39,17 +40,19 @@ void tdp_iter_restart(struct tdp_iter *iter)
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
+ int root_level = root->role.level;
+
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
- iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
tdp_iter_restart(iter);
}
@@ -86,7 +89,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -120,7 +123,7 @@ static bool try_step_side(struct tdp_iter *iter)
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
return true;
}
@@ -143,6 +146,15 @@ static bool try_step_up(struct tdp_iter *iter)
}
/*
+ * Step the iterator back up a level in the paging structure. Should only be
+ * used when the iterator is below the root level.
+ */
+void tdp_iter_step_up(struct tdp_iter *iter)
+{
+ WARN_ON(!try_step_up(iter));
+}
+
+/*
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
@@ -160,6 +172,11 @@ static bool try_step_up(struct tdp_iter *iter)
*/
void tdp_iter_next(struct tdp_iter *iter)
{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
if (try_step_down(iter))
return;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1748b988d3a..adfca0cf94d3 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,8 +6,51 @@
#include <linux/kvm_host.h>
#include "mmu.h"
+#include "spte.h"
-typedef u64 __rcu *tdp_ptep_t;
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
+{
+ /*
+ * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+ * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+ * The Writable bit can be set by KVM's fast page fault handler, and
+ * Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+ if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
+}
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
@@ -45,25 +88,32 @@ struct tdp_iter {
* iterator walks off the end of the paging structure.
*/
bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
};
/*
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \
- for (tdp_iter_start(&iter, root, root_level, min_level, start); \
+#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start); \
iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, root_level, start, end) \
- for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte(iter, root, start, end) \
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
-void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_up(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 018d82e73e31..7b9265d67131 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -10,21 +10,40 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = false;
+static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
+ struct workqueue_struct *wq;
+
if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return;
+ return 0;
+
+ wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
+ if (!wq)
+ return -ENOMEM;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
-
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+ kvm->arch.tdp_mmu_zap_wq = wq;
+ return 1;
+}
+
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
@@ -32,171 +51,292 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
+ /* Also waits for any queued work items. */
+ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down.
+ * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
+ * can call kvm_tdp_mmu_put_root and create new callbacks.
*/
rcu_barrier();
}
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- if (kvm_mmu_put_root(kvm, root))
- kvm_tdp_mmu_free_root(kvm, root);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
}
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
- return false;
-
- kvm_mmu_get_root(kvm, root);
- return true;
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+ tdp_mmu_free_sp(sp);
}
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+static void tdp_mmu_zap_root_work(struct work_struct *work)
{
- struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
+ tdp_mmu_async_work);
+ struct kvm *kvm = root->tdp_mmu_async_data;
- next_root = list_next_entry(root, link);
- tdp_mmu_put_root(kvm, root);
- return next_root;
+ read_lock(&kvm->mmu_lock);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB flush when
+ * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
+ * to a different pCPU. Note, the local TLB flush on reuse also
+ * invalidates any paging-structure-cache entries, i.e. TLB entries for
+ * intermediate paging structures, that may be zapped, as such entries
+ * are associated with the ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+ * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
+ * avoiding an infinite loop. By design, the root is reachable while
+ * it's being asynchronously zapped, thus a different task can put its
+ * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
+ * asynchronously zapped root is unavoidable.
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ read_unlock(&kvm->mmu_lock);
}
-/*
- * Note: this iterator gets and puts references to the roots it iterates over.
- * This makes it safe to release the MMU lock and yield within the loop, but
- * if exiting the loop early, the caller must drop the reference to the most
- * recent root. (Unless keeping a live reference is desirable.)
- */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
- typeof(*_root), link); \
- tdp_mmu_next_root_valid(_kvm, _root); \
- _root = tdp_mmu_next_root(_kvm, _root))
+static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ root->tdp_mmu_async_data = kvm;
+ INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
+ queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+}
-#define for_each_tdp_mmu_root(_kvm, _root) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
+static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
+{
+ union kvm_mmu_page_role role = page->role;
+ role.invalid = true;
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
+ /* No need to use cmpxchg, only the invalid bit can change. */
+ role.word = xchg(&page->role.word, role.word);
+ return role.invalid;
+}
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
- lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
- WARN_ON(root->root_count);
WARN_ON(!root->tdp_mmu_page);
- list_del(&root->link);
+ /*
+ * The root now has refcount=0. It is valid, but readers already
+ * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
+ * rejects it. This remains true for the rest of the execution
+ * of this function, because readers visit valid roots only
+ * (except for tdp_mmu_zap_root_work(), which however
+ * does not acquire any reference itself).
+ *
+ * Even though there are flows that need to visit all roots for
+ * correctness, they all take mmu_lock for write, so they cannot yet
+ * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
+ * since the root still has refcount=0.
+ *
+ * However, tdp_mmu_zap_root can yield, and writers do not expect to
+ * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
+ * So the root temporarily gets an extra reference, going to refcount=1
+ * while staying invalid. Readers still cannot acquire any reference;
+ * but writers are now allowed to run if tdp_mmu_zap_root yields and
+ * they might take an extra reference if they themselves yield.
+ * Therefore, when the reference is given back by the worker,
+ * there is no guarantee that the refcount is still 1. If not, whoever
+ * puts the last reference will free the page, but they will not have to
+ * zap the root because a root cannot go from invalid to valid.
+ */
+ if (!kvm_tdp_root_mark_invalid(root)) {
+ refcount_set(&root->tdp_mmu_root_count, 1);
- zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+ /*
+ * Zapping the root in a worker is not just "nice to have";
+ * it is required because kvm_tdp_mmu_invalidate_all_roots()
+ * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
+ * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
+ * might return with some roots not zapped yet.
+ */
+ tdp_mmu_schedule_zap_root(kvm, root);
+ return;
+ }
- free_page((unsigned long)root->spt);
- kmem_cache_free(mmu_page_header_cache, root);
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
- int level)
+/*
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL). A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ bool shared, bool only_valid)
{
- union kvm_mmu_page_role role;
+ struct kvm_mmu_page *next_root;
- role = vcpu->arch.mmu->mmu_role.base;
- role.level = level;
- role.direct = true;
- role.gpte_is_8_bytes = true;
- role.access = ACC_ALL;
+ rcu_read_lock();
- return role;
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root) {
+ if ((!only_valid || !next_root->role.invalid) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+ }
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
+ return next_root;
}
-static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
+/*
+ * Note: this iterator gets and puts references to the roots it iterates over.
+ * This makes it safe to release the MMU lock and yield within the loop, but
+ * if exiting the loop early, the caller must drop the reference to the most
+ * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
+ */
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
+
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- sp->role.word = page_role_for_level(vcpu, level).word;
+ sp->role = role;
sp->gfn = gfn;
+ sp->ptep = sptep;
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
-
- return sp;
}
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
{
+ struct kvm_mmu_page *parent_sp;
union kvm_mmu_page_role role;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_page *root;
-
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
-
- write_lock(&kvm->mmu_lock);
-
- /* Check for an existing root before allocating a new one. */
- for_each_tdp_mmu_root(kvm, root) {
- if (root->role.word == role.word) {
- kvm_mmu_get_root(kvm, root);
- write_unlock(&kvm->mmu_lock);
- return root;
- }
- }
-
- root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
- root->root_count = 1;
- list_add(&root->link, &kvm->arch.tdp_mmu_roots);
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
- write_unlock(&kvm->mmu_lock);
+ role = parent_sp->role;
+ role.level--;
- return root;
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- root = get_tdp_mmu_vcpu_root(vcpu);
- if (!root)
- return INVALID_PAGE;
+ lockdep_assert_held_write(&kvm->mmu_lock);
- return __pa(root->spt);
-}
+ /*
+ * Check for an existing root before allocating a new one. Note, the
+ * role check prevents consuming an invalid root.
+ */
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(root))
+ goto out;
+ }
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
- free_page((unsigned long)sp->spt);
- kmem_cache_free(mmu_page_header_cache, sp);
-}
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
- struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
- rcu_head);
+ refcount_set(&root->tdp_mmu_root_count, 1);
- tdp_mmu_free_sp(sp);
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+out:
+ return __pa(root->spt);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -205,13 +345,12 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
- bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
return;
if (is_accessed_spte(old_spte) &&
- (!is_accessed_spte(new_spte) || pfn_changed))
+ (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
@@ -234,34 +373,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
- * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
- *
- * @kvm: kvm instance
- * @sp: the new page
- * @shared: This operation may not be running under the exclusive use of
- * the MMU lock and the operation must synchronize with other
- * threads that might be adding or removing pages.
- * @account_nx: This page replaces a NX large page and should be marked for
- * eventual reclaim.
- */
-static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared, bool account_nx)
-{
- if (shared)
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- else
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
-
- if (shared)
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-}
-
-/**
- * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
* @kvm: kvm instance
* @sp: the page to be removed
@@ -269,8 +381,8 @@ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
-static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared)
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
{
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
@@ -286,7 +398,7 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
}
/**
- * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ * handle_removed_pt() - handle a page table removed from the TDP structure
*
* @kvm: kvm instance
* @pt: the page removed from the paging structure
@@ -302,24 +414,21 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
* this thread will be responsible for ensuring the page is freed. Hence the
* early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
- bool shared)
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
- tdp_mmu_unlink_page(kvm, sp, shared);
+ tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+ tdp_ptep_t sptep = pt + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_spte;
if (shared) {
/*
@@ -331,8 +440,8 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
* value to the removed SPTE value.
*/
for (;;) {
- old_child_spte = xchg(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_child_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_spte))
break;
cpu_relax();
}
@@ -346,33 +455,50 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
* are guarded by the memslots generation, not by being
* unreachable.
*/
- old_child_spte = READ_ONCE(*sptep);
- if (!is_shadow_present_pte(old_child_spte))
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
continue;
/*
- * Marking the SPTE as a removed SPTE is not
- * strictly necessary here as the MMU lock will
- * stop other threads from concurrently modifying
- * this SPTE. Using the removed SPTE value keeps
- * the two branches consistent and simplifies
- * the function.
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a removed SPTE is not
+ * strictly necessary for the same reason, but using
+ * the remove SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to REMOVED_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
*/
- WRITE_ONCE(*sptep, REMOVED_SPTE);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ REMOVED_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level - 1,
- shared);
+ old_spte, REMOVED_SPTE, level, shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
-
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -404,7 +530,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* If this warning were to trigger it would indicate that there was a
* missing MMU notifier or a race with some notifier handler.
* A present, leaf SPTE should never be directly replaced with another
- * present leaf SPTE pointing to a differnt PFN. A notifier handler
+ * present leaf SPTE pointing to a different PFN. A notifier handler
* should be zapping the SPTE before the main MM's page table is
* changed, or the SPTE should be zeroed, and the TLBs flushed by the
* thread before replacement.
@@ -418,7 +544,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
/*
* Crash the host to prevent error propagation and guest data
- * courruption.
+ * corruption.
*/
BUG();
}
@@ -428,6 +554,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -453,18 +582,22 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
return;
}
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
if (was_leaf && is_dirty_spte(old_spte) &&
- (!is_dirty_spte(new_spte) || pfn_changed))
+ (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
/*
* Recursively handle child PTs if the change removed a subtree from
- * the paging structure.
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present))
- handle_removed_tdp_mmu_page(kvm,
- spte_to_child_pt(old_spte, level), shared);
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -479,72 +612,102 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
/*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
+ * in KVM's dirty bitmaps.
+ *
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
*
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @new_spte: The value the SPTE should be set to
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
- lockdep_assert_held_read(&kvm->mmu_lock);
+ u64 *sptep = rcu_dereference(iter->sptep);
+ u64 old_spte;
/*
- * Do not change removed SPTEs. Only the thread that froze the SPTE
- * may modify it.
+ * The caller is responsible for ensuring the old SPTE is not a REMOVED
+ * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
*/
- if (iter->old_spte == REMOVED_SPTE)
- return false;
+ WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
- if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
- new_spte) != iter->old_spte)
- return false;
+ lockdep_assert_held_read(&kvm->mmu_lock);
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock.
+ */
+ old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
+ if (old_spte != iter->old_spte) {
+ /*
+ * The page table entry was modified by a different logical
+ * CPU. Refresh iter->old_spte with the current value so the
+ * caller operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic().
+ */
+ iter->old_spte = old_spte;
+ return -EBUSY;
+ }
- return true;
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+
+ return 0;
}
-static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
+ int ret;
+
/*
* Freeze the SPTE by setting it to a special,
* non-present value. This will stop other threads from
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
- return false;
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ if (ret)
+ return ret;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
KVM_PAGES_PER_HPAGE(iter->level));
/*
- * No other thread can overwrite the removed SPTE as they
- * must either wait on the MMU lock or use
- * tdp_mmu_set_spte_atomic which will not overrite the
- * special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present
- * to non-present.
+ * No other thread can overwrite the removed SPTE as they must either
+ * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+ * overwrite the special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present to non-present. Use
+ * the raw write helper to avoid an unnecessary check on volatile bits.
*/
- WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, 0);
- return true;
+ return 0;
}
/*
* __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
- * @kvm: kvm instance
- * @iter: a tdp_iter instance currently on the SPTE that should be set
- * @new_spte: The value the SPTE should be set to
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
* @record_acc_track: Notify the MM subsystem of changes to the accessed state
* of the page. Should be set unless handling an MMU
* notifier for access tracking. Leaving record_acc_track
@@ -555,57 +718,71 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had voldatile bits.
*/
-static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
lockdep_assert_held_write(&kvm->mmu_lock);
/*
- * No thread should be using this function to set SPTEs to the
+ * No thread should be using this function to set SPTEs to or from the
* temporary removed SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(iter->old_spte == REMOVED_SPTE);
+ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
+
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, false);
if (record_acc_track)
- handle_changed_spte_acc_track(iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
- iter->old_spte, new_spte,
- iter->level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+ return old_spte;
+}
+
+static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte, bool record_acc_track,
+ bool record_dirty_log)
+{
+ WARN_ON_ONCE(iter->yielded);
+
+ iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
+ for_each_tdp_pte(_iter, _root, _start, _end)
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -615,8 +792,7 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
- _mmu->shadow_root_level, _start, _end)
+ for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -625,77 +801,167 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
*
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
*
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
*/
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
{
+ WARN_ON(iter->yielded);
+
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- rcu_read_unlock();
-
if (flush)
kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ rcu_read_unlock();
+
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_restart(iter);
+ iter->yielded = true;
+ }
+
+ return iter->yielded;
+}
+
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
+ */
+ return kvm_mmu_max_gfn() + 1;
+}
+
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ gfn_t end = tdp_mmu_max_gfn_exclusive();
+ gfn_t start = 0;
+
+ for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
- return true;
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ goto retry;
}
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * To avoid RCU stalls due to recursively removing huge swaths of SPs,
+ * split the zap into two passes. On the first pass, zap at the 1gb
+ * level, and then zap top-level SPs on the second pass. "1gb" is not
+ * arbitrary, as KVM must be able to zap a 1gb shadow page without
+ * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ *
+ * Because zapping a SP recurses on its children, stepping down to
+ * PG_LEVEL_4K in the iterator itself is unnecessary.
+ */
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 old_spte;
+
+ /*
+ * This helper intentionally doesn't allow zapping a root shadow page,
+ * which doesn't have a parent page table and thus no associated entry.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
+ if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ return false;
+
+ __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1, true, true);
- return false;
+ return true;
}
/*
- * Tears down the mappings for the range of gfns, [start, end), and frees the
- * non-root pages mapping GFNs strictly within that range. Returns true if
- * SPTEs have been cleared and a TLB flush is needed before releasing the
- * MMU lock.
+ * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
+ * have been cleared and a TLB flush is needed before releasing the MMU lock.
+ *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup. Note, in some use cases a flush may be
- * required by prior actions. Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
*/
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush)
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
{
struct tdp_iter iter;
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
continue;
}
- if (!is_shadow_present_pte(iter.old_spte))
- continue;
-
- /*
- * If this is a non-last-level SPTE that covers a larger range
- * than should be zapped, continue, and zap the mappings at a
- * lower level.
- */
- if ((iter.gfn < start ||
- iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
+ if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -704,6 +970,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
}
rcu_read_unlock();
+
+ /*
+ * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+ * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+ */
return flush;
}
@@ -713,122 +984,194 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
*/
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
+ bool can_yield, bool flush)
{
struct kvm_mmu_page *root;
- bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool flush;
+ struct kvm_mmu_page *root;
+ int i;
+
+ /*
+ * Zap all roots, including invalid roots, as all SPTEs must be dropped
+ * before returning to the caller. Zap directly even if the root is
+ * also being zapped by a worker. Walking zapped top-level SPTEs isn't
+ * all that expensive and mmu_lock is already held, which means the
+ * worker has yielded, i.e. flushing the work instead of zapping here
+ * isn't guaranteed to be any faster.
+ *
+ * A TLB flush is unnecessary, KVM zaps everything if and only the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root, i)
+ tdp_mmu_zap_root(kvm, root, false);
+ }
+}
+
+/*
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+ flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+}
+
+/*
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is performed asynchronously, so a reference is taken on all roots.
+ * Using a separate workqueue makes it easy to ensure that the destruction is
+ * performed before the "fast zap" completes, without keeping a separate list
+ * of invalidated roots; the list is effectively the list of work items in
+ * the workqueue.
+ *
+ * Get a reference even if the root is already invalid, the asynchronous worker
+ * assumes it was gifted a reference to the root it processes. Because mmu_lock
+ * is held for write, it should be impossible to observe a root with zero refcount,
+ * i.e. the list of roots cannot be stale.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
- if (flush)
- kvm_flush_remote_tlbs(kvm);
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!root->role.invalid &&
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
+ root->role.invalid = true;
+ tdp_mmu_schedule_zap_root(kvm, root);
+ }
+ }
}
/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
*/
-static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
- int map_writable,
- struct tdp_iter *iter,
- kvm_pfn_t pfn, bool prefault)
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
u64 new_spte;
- int ret = 0;
- int make_spte_ret = 0;
+ int ret = RET_PF_FIXED;
+ bool wrprot = false;
- if (unlikely(is_noslot_pfn(pfn)))
+ WARN_ON(sp->role.level != fault->goal_level);
+ if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
- make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
- pfn, iter->old_spte, prefault, true,
- map_writable, !shadow_accessed_mask,
- &new_spte);
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch, true,
+ fault->map_writable, &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ !is_last_spte(iter->old_spte, iter->level))
+ kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level + 1));
/*
* If the page fault was caused by a write but the page is write
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
- if (write)
+ if (wrprot) {
+ if (fault->write)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(new_spte))) {
+ vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
- } else
+ } else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
-
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
- rcu_dereference(iter->sptep));
- if (!prefault)
- vcpu->stat.pf_fixed++;
+ }
return ret;
}
/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @account_nx: True if this page table is being installed to split a
+ * non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool account_nx,
+ bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_set_spte(kvm, iter, spte);
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ return 0;
+}
+
+/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault)
-{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- u64 *child_pt;
- u64 new_spte;
int ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- int level;
- int req_level;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
- if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
-
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(iter.old_spte, gfn,
- iter.level, &pfn, &level);
+ tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
- if (iter.level == level)
+ if (iter.level == fault->goal_level)
break;
/*
@@ -838,7 +1181,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
break;
/*
@@ -846,241 +1189,179 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
}
if (!is_shadow_present_pte(iter.old_spte)) {
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
- child_pt = sp->spt;
+ bool account_nx = fault->huge_page_disallowed &&
+ fault->req_level >= iter.level;
- new_spte = make_nonleaf_spte(child_pt,
- !shadow_accessed_mask);
+ /*
+ * If SPTE has been frozen by another thread, just
+ * give up and retry, avoiding unnecessary page table
+ * allocation and free.
+ */
+ if (is_removed_spte(iter.old_spte))
+ break;
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
- new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp, true,
- huge_page_disallowed &&
- req_level >= iter.level);
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- trace_kvm_mmu_get_page(sp, true);
- } else {
+ if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
tdp_mmu_free_sp(sp);
break;
}
}
}
- if (iter.level != level) {
+ /*
+ * Force the guest to retry the access if the upper level SPTEs aren't
+ * in place, or if the target leaf SPTE is frozen by another CPU.
+ */
+ if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
rcu_read_unlock();
return RET_PF_RETRY;
}
- ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
- pfn, prefault);
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
rcu_read_unlock();
return ret;
}
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root,
- gfn_t start,
- gfn_t end,
- unsigned long data))
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
+{
+ return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
+ range->end, range->may_block, flush);
+}
+
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range);
+
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ tdp_handler_t handler)
+{
struct kvm_mmu_page *root;
- int ret = 0;
- int as_id;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- as_id = kvm_mmu_page_as_id(root);
- slots = __kvm_memslots(kvm, as_id);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+ struct tdp_iter iter;
+ bool ret = false;
- ret |= handler(kvm, memslot, root, gfn_start,
- gfn_end, data);
- }
- }
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code.
+ */
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ rcu_read_lock();
- return ret;
-}
+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ ret |= handler(kvm, &iter, range);
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long unused)
-{
- return zap_gfn_range(kvm, root, start, end, false, false);
-}
+ rcu_read_unlock();
+ }
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
-{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- zap_gfn_range_hva_wrapper);
+ return ret;
}
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
*/
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start, gfn_t end,
- unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- int young = 0;
u64 new_spte = 0;
- rcu_read_lock();
+ /* If we have a non-accessed entry we don't need to change the pte. */
+ if (!is_accessed_spte(iter->old_spte))
+ return false;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ new_spte = iter->old_spte;
+
+ if (spte_ad_enabled(new_spte)) {
+ new_spte &= ~shadow_accessed_mask;
+ } else {
/*
- * If we have a non-accessed entry we don't need to change the
- * pte.
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
*/
- if (!is_accessed_spte(iter.old_spte))
- continue;
+ if (is_writable_pte(new_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(new_spte));
- new_spte = iter.old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)&new_spte);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
- new_spte = mark_spte_for_access_track(new_spte);
- }
- new_spte &= ~shadow_dirty_mask;
-
- tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
- young = 1;
-
- trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+ new_spte = mark_spte_for_access_track(new_spte);
}
- rcu_read_unlock();
+ tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
- return young;
+ return true;
}
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- age_gfn_range);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
-
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
- if (is_accessed_spte(iter.old_spte))
- return 1;
-
- return 0;
+ return is_accessed_spte(iter->old_spte);
}
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
- test_age_gfn);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- pte_t *ptep = (pte_t *)data;
- kvm_pfn_t new_pfn;
u64 new_spte;
- int need_flush = 0;
- rcu_read_lock();
-
- WARN_ON(pte_huge(*ptep));
-
- new_pfn = pte_pfn(*ptep);
-
- tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
- if (iter.level != PG_LEVEL_4K)
- continue;
+ /* Huge pages aren't expected to be modified without first being zapped. */
+ WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
- if (!is_shadow_present_pte(iter.old_spte))
- break;
-
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+ if (iter->level != PG_LEVEL_4K ||
+ !is_shadow_present_pte(iter->old_spte))
+ return false;
- if (!pte_write(*ptep)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- iter.old_spte, new_pfn);
+ /*
+ * Note, when changing a read-only SPTE, it's not strictly necessary to
+ * zero the SPTE before setting the new PFN, but doing so preserves the
+ * invariant that the PFN of a present * leaf SPTE can never change.
+ * See __handle_changed_spte().
+ */
+ tdp_mmu_set_spte(kvm, iter, 0);
- tdp_mmu_set_spte(kvm, &iter, new_spte);
- }
+ if (!pte_write(range->pte)) {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+ pte_pfn(range->pte));
- need_flush = 1;
+ tdp_mmu_set_spte(kvm, iter, new_spte);
}
- if (need_flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
- rcu_read_unlock();
-
- return 0;
+ return true;
}
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * data is a pointer to the new pte_t mapping the HVA specified by the MMU
+ * notifier.
+ * Returns non-zero if a flush is needed before releasing the MMU lock.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
- (unsigned long)host_ptep,
- set_tdp_spte);
+ /*
+ * No need to handle the remote TLB flush under RCU protection, the
+ * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
+ * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ */
+ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
/*
- * Remove write access from all the SPTEs mapping GFNs [start, end). If
- * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
*/
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
@@ -1093,9 +1374,9 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
- min_level, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1105,7 +1386,9 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ goto retry;
+
spte_set = true;
}
@@ -1118,25 +1401,205 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* only affect leaf SPTEs down to min_level.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level)
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
- }
return spte_set;
}
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+ struct kvm_mmu_page *sp;
+
+ gfp |= __GFP_ZERO;
+
+ sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)__get_free_page(gfp);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool shared)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since we are allocating while under the MMU lock we have to be
+ * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+ * reclaim and to avoid making any filesystem callbacks (which can end
+ * up invoking KVM MMU notifiers, resulting in a deadlock).
+ *
+ * If this allocation fails we drop the lock and retry with reclaim
+ * allowed.
+ */
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ if (sp)
+ return sp;
+
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ iter->yielded = true;
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ return sp;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ tdp_mmu_init_child_sp(sp, iter);
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ if (ret)
+ goto out;
+
+ /*
+ * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+ * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
+ * and then splitting each of those to 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ if (!sp) {
+ ret = -ENOMEM;
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, ret);
+ break;
+ }
+
+ if (iter.yielded)
+ continue;
+ }
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return ret;
+}
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root, shared);
+ break;
+ }
+ }
+}
+
/*
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1154,7 +1617,11 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1169,7 +1636,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
}
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ goto retry;
+
spte_set = true;
}
@@ -1184,20 +1653,17 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
*/
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
- }
return spte_set;
}
@@ -1259,16 +1725,10 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
bool wrprot)
{
struct kvm_mmu_page *root;
- int root_as_id;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
- }
}
/*
@@ -1277,40 +1737,62 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
*/
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
+ int max_mapping_level;
kvm_pfn_t pfn;
- bool spte_set = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
- spte_set = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
+ /*
+ * This is a leaf SPTE. Check if the PFN it maps can
+ * be mapped at a higher level.
+ */
pfn = spte_to_pfn(iter.old_spte);
- if (kvm_is_reserved_pfn(pfn) ||
- iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
- pfn, PG_LEVEL_NUM))
+
+ if (kvm_is_reserved_pfn(pfn))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+ iter.gfn, pfn, PG_LEVEL_NUM);
- spte_set = true;
+ WARN_ON(max_mapping_level < iter.level);
+
+ /*
+ * If this page is already mapped at the highest
+ * viable level, there's nothing more to do.
+ */
+ if (max_mapping_level == iter.level)
+ continue;
+
+ /*
+ * The page can be remapped at a higher level, so step
+ * up to zap the parent SPTE.
+ */
+ while (max_mapping_level > iter.level)
+ tdp_iter_step_up(&iter);
+
+ /* Note, a successful atomic zap also does a remote TLB flush. */
+ tdp_mmu_zap_spte_atomic(kvm, &iter);
+
+ /*
+ * If the atomic zap fails, the iter will recurse back into
+ * the same subtree to retry.
+ */
}
rcu_read_unlock();
- if (spte_set)
- kvm_flush_remote_tlbs(kvm);
}
/*
@@ -1318,40 +1800,42 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
* be replaced by large mappings, for GFNs within the slot.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
zap_collapsible_spte_range(kvm, root, slot);
- }
}
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t gfn)
+ gfn_t gfn, int min_level)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
- if (!is_writable_pte(iter.old_spte))
- break;
+ for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
new_spte = iter.old_spte &
- ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+
+ if (new_spte == iter.old_spte)
+ break;
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
@@ -1364,30 +1848,28 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
- spte_set |= write_protect_gfn(kvm, root, gfn);
- }
return spte_set;
}
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
@@ -1397,16 +1879,49 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
- *root_level = vcpu->arch.mmu->shadow_root_level;
-
- rcu_read_lock();
+ *root_level = vcpu->arch.mmu->root_role.level;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
- rcu_read_unlock();
-
return leaf;
}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte)
+{
+ struct tdp_iter iter;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ tdp_ptep_t sptep = NULL;
+
+ tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 31096ece9b14..c163f7cc23ca 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -6,90 +6,91 @@
#include <linux/kvm_host.h>
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
- gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
-static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
- /*
- * Don't allow yielding, as the caller may have a flush pending. Note,
- * if mmu_lock is held for write, zapping will never yield in this case,
- * but explicitly disallow it for safety. The TDP MMU does not yield
- * until it has made forward progress (steps sideways), and when zapping
- * a single shadow page that it's guaranteed to see (thus the mmu_lock
- * requirement), its "step sideways" will always step beyond the bounds
- * of the shadow page's gfn range and stop iterating before yielding.
- */
- lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
-}
-void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush);
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level);
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+ const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
+
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte);
#ifdef CONFIG_X86_64
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-#else
-static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
-static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-#endif
-static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
+ hpa_t hpa = mmu->root.hpa;
- if (!is_tdp_mmu_enabled(kvm))
- return false;
if (WARN_ON(!VALID_PAGE(hpa)))
return false;
+ /*
+ * A NULL shadow page is legal when shadowing a non-paging guest with
+ * PAE paging, as the MMU will be direct with root_hpa pointing at the
+ * pae_root page, not a shadow page.
+ */
sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return is_tdp_mmu_page(sp) && sp->root_count;
+ return sp && is_tdp_mmu_page(sp) && sp->root_count;
}
+#else
+static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
+#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..3f868fed9114 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
@@ -47,6 +49,32 @@
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
*/
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
+}
+
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
@@ -55,49 +83,46 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
kvm_pmu_deliver_pmi(vcpu);
}
-static void kvm_perf_overflow(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
- struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- }
+ /* Ignore counters that have been reprogrammed already. */
+ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
+ return;
+
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+ if (!pmc->intr)
+ return;
+
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
-static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- /*
- * Inject PMI. If vcpu was in a guest mode during NMI PMI
- * can be ejected on a guest mode re-entry. Otherwise we can't
- * be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until
- * woken up. So we should wake it, but this is impossible from
- * NMI context. Do it from irq work instead.
- */
- if (!kvm_is_in_guest())
- irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
- else
- kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
- }
+ __kvm_perf_overflow(pmc, true);
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user,
- bool exclude_kernel, bool intr,
- bool in_tx, bool in_tx_cp)
+ u64 config, bool exclude_user,
+ bool exclude_kernel, bool intr)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -111,22 +136,22 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config,
};
+ if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
+ return;
+
attr.sample_period = get_sample_period(pmc, pmc->counter);
- if (in_tx)
- attr.config |= HSW_IN_TX;
- if (in_tx_cp) {
+ if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
+ guest_cpuid_is_intel(pmc->vcpu)) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
* allocating the counter doesn't fail.
*/
attr.sample_period = 0;
- attr.config |= HSW_IN_TX_CHECKPOINTED;
}
event = perf_event_create_kernel_counter(&attr, -1, current,
- intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
@@ -137,18 +162,21 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->is_paused = false;
+ pmc->intr = intr;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
u64 counter = pmc->counter;
- if (!pmc->perf_event)
+ if (!pmc->perf_event || pmc->is_paused)
return;
/* update counter, reset event value to avoid redundant accumulation */
counter += perf_event_pause(pmc->perf_event, true);
pmc->counter = counter & pmc_bitmask(pmc);
+ pmc->is_paused = true;
}
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -163,18 +191,27 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
}
+static int cmp_u64(const void *pa, const void *pb)
+{
+ u64 a = *(u64 *)pa;
+ u64 b = *(u64 *)pb;
+
+ return (a > b) - (a < b);
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
- unsigned config, type = PERF_TYPE_RAW;
- u8 event_select, unit_mask;
+ u64 config;
+ u32 type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
- int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -189,37 +226,29 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) {
- for (i = 0; i < filter->nevents; i++)
- if (filter->events[i] ==
- (eventsel & AMD64_RAW_EVENT_MASK_NB))
- break;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- i == filter->nevents)
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- i < filter->nevents)
- allow_event = false;
+ __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+ if (bsearch(&key, filter->events, filter->nevents,
+ sizeof(__u64), cmp_u64))
+ allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+ else
+ allow_event = filter->action == KVM_PMU_EVENT_DENY;
}
if (!allow_event)
return;
- event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
HSW_IN_TX_CHECKPOINTED))) {
- config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc),
- event_select,
- unit_mask);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & pmu->raw_event_mask;
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
return;
@@ -230,9 +259,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT,
- (eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
@@ -265,16 +292,16 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- kvm_x86_ops.pmu_ops->find_fixed_event(idx),
+ static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmi);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
if (!pmc)
return;
@@ -296,7 +323,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
int bit;
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
if (unlikely(!pmc || !pmc->perf_event)) {
clear_bit(bit, pmu->reprogram_pmi);
@@ -316,9 +343,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/* check if idx is a valid index to access PMU */
-int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
+ return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -368,7 +395,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
+ pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
@@ -384,22 +411,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- if (kvm_x86_ops.pmu_ops->deliver_pmi)
- kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
+ static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
- return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
- kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
+ return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
+ static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -407,13 +433,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}
/* refresh PMU settings. This function generally is called when underlying
@@ -422,7 +448,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops.pmu_ops->refresh(vcpu);
+ static_call(kvm_x86_pmu_refresh)(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
@@ -430,7 +456,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
irq_work_sync(&pmu->irq_work);
- kvm_x86_ops.pmu_ops->reset(vcpu);
+ static_call(kvm_x86_pmu_reset)(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -438,7 +464,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- kvm_x86_ops.pmu_ops->init(vcpu);
+ static_call(kvm_x86_pmu_init)(vcpu);
init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
pmu->event_count = 0;
pmu->need_cleanup = false;
@@ -470,14 +496,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmu->pmc_in_use, X86_PMC_IDX_MAX);
for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
- pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}
- if (kvm_x86_ops.pmu_ops->cleanup)
- kvm_x86_ops.pmu_ops->cleanup(vcpu);
+ static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -487,6 +512,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
kvm_pmu_reset(vcpu);
}
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 prev_count;
+
+ prev_count = pmc->counter;
+ pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+
+ reprogram_counter(pmu, pmc->idx);
+ if (pmc->counter < prev_count)
+ __kvm_perf_overflow(pmc, false);
+}
+
+static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
+ unsigned int perf_hw_id)
+{
+ u64 old_eventsel = pmc->eventsel;
+ unsigned int config;
+
+ pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
+ pmc->eventsel = old_eventsel;
+ return config == perf_hw_id;
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config = pmc->current_config;
+
+ if (pmc_is_gp(pmc)) {
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ select_os = config & 0x1;
+ select_user = config & 0x2;
+ }
+
+ return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+}
+
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+
+ if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ continue;
+
+ /* Ignore checks for edge detect, pin control, invert and CMASK bits */
+ if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
+ kvm_pmu_incr_counter(pmc);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
@@ -518,6 +603,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */
*filter = tmp;
+ /*
+ * Sort the in-kernel list so that we can search it with bsearch.
+ */
+ sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7b30bc967af3..e745f443b6a8 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -15,8 +15,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-#define MAX_FIXED_COUNTERS 3
-
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
@@ -24,15 +22,13 @@ struct kvm_event_hw_type_mapping {
};
struct kvm_pmu_ops {
- unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask);
- unsigned (*find_fixed_event)(int idx);
+ unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
- int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
+ bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -43,6 +39,8 @@ struct kvm_pmu_ops {
void (*cleanup)(struct kvm_vcpu *vcpu);
};
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -55,7 +53,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
u64 counter, enabled, running;
counter = pmc->counter;
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_read_value(pmc->perf_event,
&enabled, &running);
/* FIXME: Scaling needed? */
@@ -90,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
return pmc->type == KVM_PMC_FIXED;
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
-{
- return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc);
-}
-
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
@@ -103,7 +96,7 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
/* returns general purpose PMC with the specified MSR. Note that it can be
* used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
- * paramenter to tell them apart.
+ * parameter to tell them apart.
*/
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
@@ -142,6 +135,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
return sample_period;
}
+static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused)
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
@@ -149,7 +151,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -159,6 +161,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644
index 000000000000..a19d473d0184
--- /dev/null
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM. Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
+ [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+};
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+ if (x86_feature == X86_FEATURE_SGX1)
+ return KVM_X86_FEATURE_SGX1;
+ else if (x86_feature == X86_FEATURE_SGX2)
+ return KVM_X86_FEATURE_SGX2;
+
+ return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ return __feature_translate(x86_feature) / 32;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ x86_feature = __feature_translate(x86_feature);
+
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool set)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ /*
+ * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+ * compiler into using CMOV instead of Jcc when possible.
+ */
+ if (set)
+ *reg |= __feature_bit(x86_feature);
+ else
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 78bdcfac4e40..d1bc5820ea46 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -27,26 +27,6 @@
#include "irq.h"
#include "svm.h"
-/* enable / disable AVIC */
-int avic;
-#ifdef CONFIG_X86_LOCAL_APIC
-module_param(avic, int, S_IRUGO);
-#endif
-
-#define SVM_AVIC_DOORBELL 0xc001011b
-
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe. APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT 255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
-
/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS 8
#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
@@ -79,12 +59,6 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
-enum avic_ipi_failure_cause {
- AVIC_IPI_FAILURE_INVALID_INT_TYPE,
- AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
- AVIC_IPI_FAILURE_INVALID_TARGET,
- AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
/* Note:
* This function is called from IOMMU driver to notify
@@ -126,7 +100,7 @@ void avic_vm_destroy(struct kvm *kvm)
unsigned long flags;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- if (!avic)
+ if (!enable_apicv)
return;
if (kvm_svm->avic_logical_id_table_page)
@@ -149,7 +123,7 @@ int avic_vm_init(struct kvm *kvm)
struct page *l_page;
u32 vm_id;
- if (!avic)
+ if (!enable_apicv)
return 0;
/* Allocating physical APIC ID table (4KB) */
@@ -191,9 +165,8 @@ free_avic:
return err;
}
-void avic_init_vmcb(struct vcpu_svm *svm)
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
- struct vmcb *vmcb = svm->vmcb;
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@ -203,6 +176,8 @@ void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
if (kvm_apicv_activated(svm->vcpu.kvm))
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
else
@@ -223,7 +198,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
return &avic_physical_id_table[index];
}
-/**
+/*
* Note:
* AVIC hardware walks the nested page table to check permissions,
* but does not use the SPA address specified in the leaf page
@@ -231,31 +206,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_update_access_page(struct kvm *kvm, bool activate)
+static int avic_alloc_access_page(struct kvm *kvm)
{
void __user *ret;
int r = 0;
mutex_lock(&kvm->slots_lock);
- /*
- * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
- * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
- * memory region. So, we need to ensure that kvm->mm == current->mm.
- */
- if ((kvm->arch.apic_access_page_done == activate) ||
- (kvm->mm != current->mm))
+
+ if (kvm->arch.apic_access_memslot_enabled)
goto out;
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
- activate ? PAGE_SIZE : 0);
+ PAGE_SIZE);
if (IS_ERR(ret)) {
r = PTR_ERR(ret);
goto out;
}
- kvm->arch.apic_access_page_done = activate;
+ kvm->arch.apic_access_memslot_enabled = true;
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -270,18 +240,18 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
- if (!svm->vcpu.arch.apic->regs)
+ if (!vcpu->arch.apic->regs)
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_update_access_page(vcpu->kvm, true);
+ ret = avic_alloc_access_page(vcpu->kvm);
if (ret)
return ret;
}
- svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
@@ -298,48 +268,172 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu())
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ put_cpu();
+}
+
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ u32 l1_physical_id, dest;
+ struct kvm_vcpu *target_vcpu;
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_APIC_DEST_FIELD(icrh);
+
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ l1_physical_id = dest;
+
+ if (WARN_ON_ONCE(l1_physical_id != index))
+ return -EINVAL;
+
+ } else {
+ u32 bitmap, cluster;
+ int logid_index;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF0000;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask*/
+ bitmap = dest;
+ cluster = 0;
+ } else {
+ /* 4 bit desk mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
+ }
+
+ if (unlikely(!bitmap))
+ /* guest bug: nobody to send the logical interrupt to */
+ return 0;
+
+ if (!is_power_of_2(bitmap))
+ /* multiple logical destinations, use slow path */
+ return -EINVAL;
+
+ logid_index = cluster + __ffs(bitmap);
+
+ if (apic_x2apic_mode(source)) {
+ l1_physical_id = logid_index;
+ } else {
+ u32 *avic_logical_id_table =
+ page_address(kvm_svm->avic_logical_id_table_page);
+
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ if (WARN_ON_ONCE(index != logid_index))
+ return -EINVAL;
+
+ /* guest bug: non existing/reserved logical destination */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return 0;
+
+ l1_physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ }
+ }
+
+ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+ if (unlikely(!target_vcpu))
+ /* guest bug: non existing vCPU is a target of this IPI*/
+ return 0;
+
+ target_vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(target_vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
+}
+
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
- u32 icrl, u32 icrh)
+ u32 icrl, u32 icrh, u32 index)
{
+ unsigned long i;
struct kvm_vcpu *vcpu;
- int i;
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, source,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
+
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
}
}
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
- u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
- * AVIC hardware handles the generation of
- * IPIs when the specified Message Type is Fixed
- * (also known as fixed delivery mode) and
- * the Trigger Mode is edge-triggered. The hardware
- * also supports self and broadcast delivery modes
- * specified via the Destination Shorthand(DSH)
- * field of the ICRL. Logical and physical APIC ID
- * formats are supported. All other IPI types cause
- * a #VMEXIT, which needs to emulated.
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
+ * a trap, e.g. ICR holds the correct value and RIP has been
+ * advanced, KVM is responsible only for emulating the IPI.
+ * Sadly, hardware may sometimes leave the BUSY flag set, in
+ * which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
@@ -347,11 +441,9 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -363,6 +455,13 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
return 1;
}
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
@@ -442,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
return ret;
}
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
- u64 *old, *new;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (vcpu->vcpu_id == id)
- return 0;
-
- old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
- new = avic_get_physical_id_entry(vcpu, id);
- if (!new || !old)
- return 1;
-
- /* We need to move physical_id_entry to new offset */
- *new = *old;
- *old = 0ULL;
- to_svm(vcpu)->avic_physical_id_cache = new;
-
- /*
- * Also update the guest physical APIC ID in the logical
- * APIC ID table entry if already setup the LDR.
- */
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
-
- return 0;
-}
-
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -483,30 +553,24 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
svm->dfr_reg = dfr;
}
-static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
- u32 offset = svm->vmcb->control.exit_info_1 &
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
- case APIC_ID:
- if (avic_handle_apic_id_update(&svm->vcpu))
- return 0;
- break;
case APIC_LDR:
- if (avic_handle_ldr_update(&svm->vcpu))
+ if (avic_handle_ldr_update(vcpu))
return 0;
break;
case APIC_DFR:
- avic_handle_dfr_update(&svm->vcpu);
+ avic_handle_dfr_update(vcpu);
break;
default:
break;
}
- kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
-
+ kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -539,8 +603,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
return ret;
}
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret = 0;
u32 offset = svm->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,15 +615,15 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
AVIC_UNACCEL_ACCESS_WRITE_MASK;
bool trap = is_avic_unaccelerated_access_trap(offset);
- trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
trap, write, vector);
if (trap) {
/* Handling Trap */
WARN_ONCE(!write, "svm: Handling trap read.\n");
- ret = avic_unaccel_trap_write(svm);
+ ret = avic_unaccel_trap_write(vcpu);
} else {
/* Handling Fault */
- ret = kvm_emulate_instruction(&svm->vcpu, 0);
+ ret = kvm_emulate_instruction(vcpu, 0);
}
return ret;
@@ -569,10 +634,10 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
- if (!avic || !irqchip_in_kernel(vcpu->kvm))
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
- ret = avic_init_backing_page(&svm->vcpu);
+ ret = avic_init_backing_page(vcpu);
if (ret)
return ret;
@@ -583,39 +648,13 @@ int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-void avic_post_state_restore(struct kvm_vcpu *vcpu)
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
- if (avic_handle_apic_id_update(vcpu) != 0)
- return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
- if (!avic || !lapic_in_kernel(vcpu))
- return;
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_request_apicv_update(vcpu->kvm, activate,
- APICV_INHIBIT_REASON_IRQWIN);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
unsigned long flags;
@@ -647,63 +686,6 @@ out:
return ret;
}
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
- if (!avic)
- return;
-
- if (activated) {
- /**
- * During AVIC temporary deactivation, guest could update
- * APIC ID, DFR and LDR registers, which would not be trapped
- * by avic_unaccelerated_access_interception(). In this case,
- * we need to check and update the AVIC logical APIC ID table
- * accordingly before re-activating.
- */
- avic_post_state_restore(vcpu);
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- } else {
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- }
- vmcb_mark_dirty(vmcb, VMCB_AVIC);
-
- svm_set_pi_irte_mode(vcpu, activated);
-}
-
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
- return;
-}
-
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
- if (!vcpu->arch.apicv_active)
- return -1;
-
- kvm_lapic_set_irr(vec, vcpu->arch.apic);
- smp_mb__after_atomic();
-
- if (avic_vcpu_is_running(vcpu)) {
- int cpuid = vcpu->cpu;
-
- if (cpuid != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
- put_cpu();
- } else
- kvm_vcpu_wake_up(vcpu);
-
- return 0;
-}
-
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
@@ -727,7 +709,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
struct amd_svm_iommu_ir *ir;
/**
- * In some cases, the existing irte is updaed and re-set,
+ * In some cases, the existing irte is updated and re-set,
* so we need to check here if it's already been * added
* to the ir_list.
*/
@@ -764,7 +746,7 @@ out:
return ret;
}
-/**
+/*
* Note:
* The HW cannot support posting multicast/broadcast
* interrupts to a vCPU. So, we still use legacy interrupt
@@ -801,7 +783,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
}
/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -809,12 +791,12 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
- int idx, ret = -EINVAL;
+ int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP))
@@ -825,7 +807,13 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
struct vcpu_data vcpu_info;
@@ -838,7 +826,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* Here, we setup with legacy mode in the following cases:
* 1. When cannot target interrupt to a specific vcpu.
* 2. Unsetting posted interrupt.
- * 3. APIC virtialization is disabled for the vcpu.
+ * 3. APIC virtualization is disabled for the vcpu.
* 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
*/
if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
@@ -910,22 +898,23 @@ out:
return ret;
}
-bool svm_check_apicv_inhibit_reasons(ulong bit)
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_X2APIC);
+ BIT(APICV_INHIBIT_REASON_X2APIC) |
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_SEV) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
- return supported & BIT(bit);
+ return supported & BIT(reason);
}
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
-{
- avic_update_access_page(kvm, activate);
-}
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
@@ -960,18 +949,22 @@ out:
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
*/
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (kvm_vcpu_is_blocking(vcpu))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
@@ -979,14 +972,10 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (svm->avic_is_running)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
- svm->avic_is_running);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -994,39 +983,78 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
+ lockdep_assert_preemption_disabled();
entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ return;
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ } else {
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
- svm->avic_is_running = is_run;
- if (is_run)
+ if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
else
avic_vcpu_put(vcpu);
+
+ avic_set_pi_irte_mode(vcpu, activated);
}
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- avic_set_running(vcpu, false);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_
+ * the vCPU actually blocks.
+ *
+ * Any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source, therefore vIRR will also
+ * be checked by kvm_vcpu_check_block() before blocking. The
+ * memory barrier implicit in set_current_state orders writing
+ * IsRunning=0 before reading the vIRR. The processor needs a
+ * matching memory barrier on interrupt delivery between writing
+ * IRR and reading IsRunning; the lack of this barrier might be
+ * the cause of errata #1235).
+ */
+ avic_vcpu_put(vcpu);
}
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
- kvm_vcpu_update_apicv(vcpu);
- avic_set_running(vcpu, true);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ avic_vcpu_load(vcpu, vcpu->cpu);
}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..7d6d97968fb9
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB
+ * control area to expose SVM enlightenments to guests.
+ */
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fb204eaa8bb3..ba7cd26f438f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -28,44 +28,53 @@
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
+#include "hyperv.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
- if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
* TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1.
*/
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = (1ULL << 32);
- svm->vmcb->control.exit_info_2 = fault->address;
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
}
- svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
- svm->vmcb->control.exit_info_1 |= fault->error_code;
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
nested_svm_vmexit(svm);
}
-static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(!is_guest_mode(vcpu));
-
- if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
- nested_svm_vmexit(svm);
- } else {
- kvm_inject_page_fault(vcpu, fault);
- }
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ WARN_ON(!is_guest_mode(vcpu));
+
+ if (vmcb12_is_intercept(&svm->nested.ctl,
+ INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
+ !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = fault->error_code;
+ vmcb->control.exit_info_2 = fault->address;
+ nested_svm_vmexit(svm);
+ return true;
+ }
+
+ return false;
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -92,17 +101,22 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+
+ /*
+ * The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
- reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
@@ -112,9 +126,24 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!svm->v_vmload_vmsave_enabled)
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
void recalc_intercepts(struct vcpu_svm *svm)
{
- struct vmcb_control_area *c, *h, *g;
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
unsigned int i;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -123,7 +152,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
+ h = &svm->vmcb01.ptr->control;
g = &svm->nested.ctl;
for (i = 0; i < MAX_INTERCEPT; i++)
@@ -147,49 +176,50 @@ void recalc_intercepts(struct vcpu_svm *svm)
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
-}
-
-static void copy_vmcb_control_area(struct vmcb_control_area *dst,
- struct vmcb_control_area *from)
-{
- unsigned int i;
- for (i = 0; i < MAX_INTERCEPT; i++)
- dst->intercepts[i] = from->intercepts[i];
+ /* If SMI is not intercepted, ignore guest SMI intercept as well */
+ if (!intercept_smi)
+ vmcb_clr_intercept(c, INTERCEPT_SMI);
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- /* asid not copied, it is handled manually for svm->vmcb. */
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->virt_ext = from->virt_ext;
- dst->pause_filter_count = from->pause_filter_count;
- dst->pause_filter_thresh = from->pause_filter_thresh;
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
}
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ int i;
+
/*
- * This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm msr permission bitmap may contain zero bits
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- int i;
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -208,95 +238,190 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
}
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
- struct vcpu_svm *svm = to_svm(vcpu);
+ u64 addr = PAGE_ALIGN(pa);
- if (WARN_ON(!is_guest_mode(vcpu)))
- return true;
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
+}
- if (!nested_svm_vmrun_msrpm(svm)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return false;
+static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
+{
+ /* Nested FLUSHBYASID is not supported yet. */
+ switch(tlb_ctl) {
+ case TLB_CONTROL_DO_NOTHING:
+ case TLB_CONTROL_FLUSH_ALL_ASID:
+ return true;
+ default:
+ return false;
}
-
- return true;
}
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
{
- if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
+ return false;
+
+ if (CC(control->asid == 0))
+ return false;
+
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
return false;
- if (control->asid == 0)
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
return false;
- if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
- !npt_enabled)
+ if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
return false;
return true;
}
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- bool vmcb12_lma;
-
- /*
- * FIXME: these should be done after copying the fields,
- * to avoid TOC/TOU races. For these save area checks
- * the possible damage is limited since kvm_set_cr0 and
- * kvm_set_cr4 handle failure; EFER_SVME is an exception
- * so it is force-set later in nested_prepare_vmcb_save.
- */
- if ((vmcb12->save.efer & EFER_SVME) == 0)
+ if (CC(!(save->efer & EFER_SVME)))
return false;
- if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
return false;
- if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
return false;
- vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
-
- if (vmcb12_lma) {
- if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
- !(vmcb12->save.cr0 & X86_CR0_PE) ||
- kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
return false;
}
- if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+
+ if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
return false;
return true;
}
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
{
- copy_vmcb_control_area(&svm->nested.ctl, control);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
- /* Copy it here because nested_svm_check_controls will check it. */
- svm->nested.ctl.asid = control->asid;
- svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
- svm->nested.ctl.iopm_base_pa &= ~0x0fffULL;
+ return __nested_vmcb_check_save(vcpu, save);
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
+
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
+
+static
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(to->reserved_sw, from->reserved_sw,
+ sizeof(struct hv_enlightenments));
+ }
+}
+
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
+{
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
+}
+
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+ * to avoid TOC/TOU races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
}
/*
* Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
*/
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
u32 mask;
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
@@ -316,6 +441,10 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
*/
mask &= ~V_IRQ_MASK;
}
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -324,8 +453,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
* Transfer any event that L0 or L1 wanted to inject into L2 to
* EXIT_INT_INFO.
*/
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
@@ -357,9 +486,23 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
vmcb12->control.exit_int_info = exit_int_info;
}
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/*
@@ -367,104 +510,218 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
* if we are emulating VM-Entry into a guest with NPT enabled.
*/
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
- bool nested_npt)
+ bool nested_npt, bool reload_pdptrs)
{
- if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
return -EINVAL;
- if (!nested_npt && is_pae_paging(vcpu) &&
- (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
- return -EINVAL;
- }
-
- /*
- * TODO: optimize unconditional TLB flush/MMU sync here and in
- * kvm_init_shadow_npt_mmu().
- */
- if (!nested_npt)
- kvm_mmu_new_pgd(vcpu, cr3, false, false);
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3)))
+ return -EINVAL;
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
- kvm_init_mmu(vcpu, false);
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+
+ nested_vmcb02_compute_g_pat(svm);
+
/* Load the nested guest state */
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
+ }
+
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- /*
- * Force-set EFER_SVME even though it is checked earlier on the
- * VMCB12, because the guest can flip the bit between the check
- * and now. Clearing EFER_SVME would call svm_free_nested.
- */
- svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+ svm_set_efer(&svm->vcpu, svm->nested.save.efer);
+
+ svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
+ svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
- svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
- svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = vmcb12->save.rax;
- svm->vmcb->save.rsp = vmcb12->save.rsp;
- svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
+
+ /* These bits will be set properly on the first execution when new_vmc12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ svm_update_lbrv(&svm->vcpu);
+
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
}
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
- const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
+
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+ /* Also overwritten later if necessary. */
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
if (nested_npt_enabled(svm))
- nested_svm_init_mmu_context(&svm->vcpu);
+ nested_svm_init_mmu_context(vcpu);
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
- svm->vmcb->control.int_ctl =
- (svm->nested.ctl.int_ctl & ~mask) |
- (svm->nested.hsave->control.int_ctl & mask);
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ nested_svm_update_tsc_ratio_msr(vcpu);
+ }
- svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
- svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
- svm->vmcb->control.int_state = svm->nested.ctl.int_state;
- svm->vmcb->control.event_inj = svm->nested.ctl.event_inj;
- svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ vmcb02->control.int_ctl =
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+ LBR_CTL_ENABLE_MASK;
+ if (svm->lbrv_enabled)
+ vmcb02->control.virt_ext |=
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+ pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
+
+ } else {
+ /* start from host values otherwise */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
+ }
- svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
- svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
+ nested_svm_transition_tlb_flush(vcpu);
/* Enter Guest-Mode */
- enter_guest_mode(&svm->vcpu);
+ enter_guest_mode(vcpu);
/*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
*/
recalc_intercepts(svm);
+}
- vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
- struct vmcb *vmcb12)
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
+ struct vmcb *vmcb12, bool from_vmrun)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,56 +739,71 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
svm->nested.vmcb12_gpa = vmcb12_gpa;
- nested_prepare_vmcb_control(svm);
- nested_prepare_vmcb_save(svm, vmcb12);
- ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
- nested_npt_enabled(svm));
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_save(svm, vmcb12);
+
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
+ nested_npt_enabled(svm), from_vmrun);
if (ret)
return ret;
- if (!npt_enabled)
- svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
svm_set_gif(svm, true);
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
return 0;
}
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
- if (is_smm(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ if (!svm->nested.hsave_msr) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmcb12_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
} else if (ret) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
vmcb12 = map.hva;
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- if (!nested_vmcb_check_save(svm, vmcb12) ||
- !nested_vmcb_check_controls(&svm->nested.ctl)) {
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -539,38 +811,22 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
goto out;
}
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
/*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
*/
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
- copy_vmcb_control_area(&hsave->control, &vmcb->control);
+ if (!npt_enabled)
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
@@ -587,12 +843,33 @@ out_exit_err:
nested_svm_vmexit(svm);
out:
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
-void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
+{
+ to_save->es = from_save->es;
+ to_save->cs = from_save->cs;
+ to_save->ss = from_save->ss;
+ to_save->ds = from_save->ds;
+ to_save->gdtr = from_save->gdtr;
+ to_save->idtr = from_save->idtr;
+ to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+ to_save->efer = from_save->efer;
+ to_save->cr0 = from_save->cr0;
+ to_save->cr3 = from_save->cr3;
+ to_save->cr4 = from_save->cr4;
+ to_save->rax = from_save->rax;
+ to_save->rsp = from_save->rsp;
+ to_save->rip = from_save->rip;
+ to_save->cpl = 0;
+}
+
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
to_vmcb->save.fs = from_vmcb->save.fs;
to_vmcb->save.gs = from_vmcb->save.gs;
@@ -610,105 +887,122 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
- int rc;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
+ int rc;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
vmcb12 = map.hva;
/* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
+ leave_guest_mode(vcpu);
svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
/* Give the current vmcb to the guest */
- vmcb12->save.es = vmcb->save.es;
- vmcb12->save.cs = vmcb->save.cs;
- vmcb12->save.ss = vmcb->save.ss;
- vmcb12->save.ds = vmcb->save.ds;
- vmcb12->save.gdtr = vmcb->save.gdtr;
- vmcb12->save.idtr = vmcb->save.idtr;
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
- vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
- vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
- vmcb12->save.cr2 = vmcb->save.cr2;
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb12->save.cr2 = vmcb02->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
- vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
- vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
- vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
- vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
- vmcb12->save.dr7 = vmcb->save.dr7;
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
+ vmcb12->save.dr7 = vmcb02->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
- vmcb12->save.cpl = vmcb->save.cpl;
+ vmcb12->save.cpl = vmcb02->save.cpl;
- vmcb12->control.int_state = vmcb->control.int_state;
- vmcb12->control.exit_code = vmcb->control.exit_code;
- vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
- vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
- vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, vmcb12);
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
- vmcb12->control.next_rip = vmcb->control.next_rip;
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- vmcb12->control.pause_filter_count =
- svm->vmcb->control.pause_filter_count;
- vmcb12->control.pause_filter_thresh =
- svm->vmcb->control.pause_filter_thresh;
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
- /* Restore the original control entries */
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ }
+
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ svm_copy_lbrs(vmcb12, vmcb02);
+ svm_update_lbrv(vcpu);
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb01, vmcb02);
+ svm_update_lbrv(vcpu);
+ }
- /* On vmexit the GIF is set to false */
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
svm_set_gif(svm, false);
+ vmcb01->control.exit_int_info = 0;
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset;
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ }
svm->nested.ctl.nested_cr3 = 0;
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = DR7_FIXED_1;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
-
- vmcb_mark_all_dirty(svm->vmcb);
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
+
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
vmcb12->control.exit_info_1,
@@ -717,50 +1011,71 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ nested_svm_transition_tlb_flush(vcpu);
- rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ nested_svm_uninit_mmu_context(vcpu);
+
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
if (rc)
return 1;
- if (npt_enabled)
- svm->vmcb->save.cr3 = hsave->save.cr3;
-
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
* doesn't end up in L1.
*/
svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it an be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+ * to benefit from it right away.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_vcpu_update_apicv(vcpu);
return 0;
}
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
int svm_allocate_nested(struct vcpu_svm *svm)
{
- struct page *hsave_page;
+ struct page *vmcb02_page;
if (svm->nested.initialized)
return 0;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
return -ENOMEM;
- svm->nested.hsave = page_address(hsave_page);
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
- goto err_free_hsave;
+ goto err_free_vmcb02;
svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
-err_free_hsave:
- __free_page(hsave_page);
+err_free_vmcb02:
+ __free_page(vmcb02_page);
return -ENOMEM;
}
@@ -772,8 +1087,17 @@ void svm_free_nested(struct vcpu_svm *svm)
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
- __free_page(virt_to_page(svm->nested.hsave));
- svm->nested.hsave = NULL;
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
+
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
svm->nested.initialized = false;
}
@@ -781,20 +1105,23 @@ void svm_free_nested(struct vcpu_svm *svm)
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
-void svm_leave_nested(struct vcpu_svm *svm)
+void svm_leave_nested(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(&svm->vcpu)) {
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
- leave_guest_mode(&svm->vcpu);
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
}
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -802,7 +1129,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
u32 offset, msr, value;
int write, mask;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -829,7 +1156,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
u8 start_bit;
u64 gpa;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
@@ -860,12 +1187,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -883,7 +1210,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
break;
}
default: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
}
}
@@ -903,16 +1230,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
return vmexit;
}
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME) ||
- !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -929,12 +1255,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
unsigned int nr = svm->vcpu.arch.exception.nr;
+ struct vmcb *vmcb = svm->vmcb;
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ vmcb->control.exit_code_hi = 0;
if (svm->vcpu.arch.exception.has_error_code)
- svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+ vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
/*
* EXITINFO2 is undefined for all exception intercepts other
@@ -942,11 +1269,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
*/
if (nr == PF_VECTOR) {
if (svm->vcpu.arch.exception.nested_apf)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else if (svm->vcpu.arch.exception.has_payload)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
} else if (nr == DB_VECTOR) {
/* See inject_pending_event. */
kvm_deliver_exception_payload(&svm->vcpu);
@@ -960,50 +1287,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
nested_svm_vmexit(svm);
}
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_SMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
-}
-
-static void nested_svm_init(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_INIT;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
-
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1305,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_init(svm))
return 0;
- nested_svm_init(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
return 0;
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ /*
+ * Only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the EXITINTINFO
+ * vmcb field, while delivering the pending exception.
+ */
+ if (svm->nested.nested_run_pending)
return -EBUSY;
if (!nested_exit_on_exception(svm))
return 0;
@@ -1035,7 +1329,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_smi(svm))
return 0;
- nested_svm_smi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
@@ -1044,7 +1338,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_nmi(svm))
return 0;
- nested_svm_nmi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
return 0;
}
@@ -1053,7 +1347,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_intr(svm))
return 0;
- nested_svm_intr(svm);
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
return 0;
}
@@ -1072,8 +1367,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
- excp_bits)
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
@@ -1088,11 +1383,58 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_CONTINUE;
}
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'reserved_sw' are not changed by KVM */
+}
+
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
u32 user_data_size)
{
struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
struct kvm_nested_state kvm_state = {
.flags = 0,
.format = KVM_STATE_NESTED_FORMAT_SVM,
@@ -1134,13 +1476,21 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
*/
if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
return -EFAULT;
- if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
- sizeof(user_vmcb->control)))
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
return -EFAULT;
- if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
-
out:
return kvm_state.size;
}
@@ -1150,13 +1500,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state *kvm_state)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
+ unsigned long cr0;
int ret;
- u32 cr0;
BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
KVM_STATE_NESTED_SVM_VMCB_SIZE);
@@ -1184,7 +1535,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
}
@@ -1195,8 +1546,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
- save = kzalloc(sizeof(*save), GFP_KERNEL);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
if (!ctl || !save)
goto out_free;
@@ -1207,12 +1558,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(ctl))
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
/*
* Processor state contains L2 state. Check that it is
- * valid for guest mode (see nested_vmcb_checks).
+ * valid for guest mode (see nested_vmcb_check_save).
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1573,53 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
- * TODO: validate reserved bits for all saved state.
*/
- if (!(save->cr0 & X86_CR0_PG))
- goto out_free;
- if (!(save->efer & EFER_SVME))
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
+
/*
- * All checks done, we can enter guest mode. L1 control fields
- * come from the nested save state. Guest state is already
- * in the registers, the save area of the nested state instead
- * contains saved L1 state.
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * L2 registers if needed are moved from the current VMCB to VMCB02.
*/
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(vcpu);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
- copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = *save;
-
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, ctl);
- nested_prepare_vmcb_control(svm);
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
@@ -1254,8 +1630,39 @@ out_free:
return ret;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state which can lead to a load of wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
.check_events = svm_check_nested_events,
+ .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
+ .triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index fdf587f19c5f..136039fc6d01 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -16,6 +16,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"
+#include "svm.h"
enum pmu_type {
PMU_TYPE_COUNTER = 0,
@@ -44,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+/* duplicated from amd_f17h_perfmon_event_map. */
+static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+/* amd_pmc_perf_hw_id depends on these being the same size */
+static_assert(ARRAY_SIZE(amd_event_mapping) ==
+ ARRAY_SIZE(amd_f17h_event_mapping));
+
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -100,6 +117,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ if (!vcpu->kvm->arch.enable_pmu)
+ return NULL;
+
switch (msr) {
case MSR_F15H_PERF_CTL0:
case MSR_F15H_PERF_CTL1:
@@ -134,27 +154,31 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return &pmu->gp_counters[msr_to_index(msr)];
}
-static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_event_hw_type_mapping *event_mapping;
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
+ /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+ if (WARN_ON(pmc_is_fixed(pmc)))
+ return PERF_COUNT_HW_MAX;
+
+ if (guest_cpuid_family(pmc->vcpu) >= 0x17)
+ event_mapping = amd_f17h_event_mapping;
+ else
+ event_mapping = amd_event_mapping;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (amd_event_mapping[i].eventsel == event_select
- && amd_event_mapping[i].unit_mask == unit_mask)
+ if (event_mapping[i].eventsel == event_select
+ && event_mapping[i].unit_mask == unit_mask)
break;
if (i == ARRAY_SIZE(amd_event_mapping))
return PERF_COUNT_HW_MAX;
- return amd_event_mapping[i].event_type;
-}
-
-/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
-static unsigned amd_find_fixed_event(int idx)
-{
- return PERF_COUNT_HW_MAX;
+ return event_mapping[i].event_type;
}
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
@@ -181,14 +205,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
}
-/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
idx &= ~(3u << 30);
- return (idx >= pmu->nr_arch_gp_counters);
+ return idx < pmu->nr_arch_gp_counters;
}
/* idx is the ECX register of RDPMC instruction */
@@ -256,17 +279,16 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
+ pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
reprogram_gp_counter(pmc, data);
- return 0;
- }
+ return 0;
}
return 1;
@@ -282,7 +304,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
@@ -319,9 +342,8 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
}
-struct kvm_pmu_ops amd_pmu_ops = {
- .find_arch_event = amd_find_arch_event,
- .find_fixed_event = amd_find_fixed_event,
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
+ .pmc_perf_hw_id = amd_pmc_perf_hw_id,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 874ea309279f..51fd985cf21d 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,11 +14,13 @@
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
-#include <asm/fpu/internal.h>
+#include <asm/pkru.h>
#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
#include "x86.h"
#include "svm.h"
@@ -26,14 +28,41 @@
#include "cpuid.h"
#include "trace.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
+#ifdef CONFIG_KVM_AMD_SEV
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+#else
+#define sev_enabled false
+#define sev_es_enabled false
+#endif /* CONFIG_KVM_AMD_SEV */
static u8 sev_enc_bit;
-static int sev_flush_asids(void);
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
+static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
@@ -45,9 +74,15 @@ struct enc_region {
unsigned long size;
};
-static int sev_flush_asids(void)
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(int min_asid, int max_asid)
{
- int ret, error = 0;
+ int ret, asid, error = 0;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
+ return -EBUSY;
/*
* DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
@@ -66,56 +101,81 @@ static int sev_flush_asids(void)
return ret;
}
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(int min_asid, int max_asid)
{
- int pos;
-
- /* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid);
- if (pos >= max_asid)
- return false;
-
- if (sev_flush_asids())
+ if (sev_flush_asids(min_asid, max_asid))
return false;
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
- max_sev_asid);
- bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
return true;
}
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
static int sev_asid_new(struct kvm_sev_info *sev)
{
- int pos, min_asid, max_asid;
+ int asid, min_asid, max_asid, ret;
bool retry = true;
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = sev_misc_cg_try_charge(sev);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
+
mutex_lock(&sev_bitmap_lock);
/*
* SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
* SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
*/
- min_asid = sev->es_active ? 0 : min_sev_asid - 1;
+ min_asid = sev->es_active ? 1 : min_sev_asid;
max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
- pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid);
- if (pos >= max_asid) {
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
if (retry && __sev_recycle_asids(min_asid, max_asid)) {
retry = false;
goto again;
}
mutex_unlock(&sev_bitmap_lock);
- return -EBUSY;
+ ret = -EBUSY;
+ goto e_uncharge;
}
- __set_bit(pos, sev_asid_bitmap);
+ __set_bit(asid, sev_asid_bitmap);
mutex_unlock(&sev_bitmap_lock);
- return pos + 1;
+ return asid;
+e_uncharge:
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
}
static int sev_get_asid(struct kvm *kvm)
@@ -125,55 +185,53 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
}
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
{
struct svm_cpu_data *sd;
- int cpu, pos;
+ int cpu;
mutex_lock(&sev_bitmap_lock);
- pos = asid - 1;
- __set_bit(pos, sev_reclaim_asid_bitmap);
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
sd = per_cpu(svm_data, cpu);
- sd->sev_vmcbs[pos] = NULL;
+ sd->sev_vmcbs[sev->asid] = NULL;
}
mutex_unlock(&sev_bitmap_lock);
+
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
}
-static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+static void sev_decommission(unsigned int handle)
{
- struct sev_data_decommission *decommission;
- struct sev_data_deactivate *data;
+ struct sev_data_decommission decommission;
if (!handle)
return;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate deactivate;
+
+ if (!handle)
return;
- /* deactivate handle */
- data->handle = handle;
+ deactivate.handle = handle;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
- sev_guest_deactivate(data, NULL);
+ sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
- kfree(data);
-
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
- /* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
+ sev_decommission(handle);
}
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
@@ -181,54 +239,50 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
int asid, ret;
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
ret = -EBUSY;
if (unlikely(sev->active))
return ret;
+ sev->active = true;
+ sev->es_active = argp->id == KVM_SEV_ES_INIT;
asid = sev_asid_new(sev);
if (asid < 0)
- return ret;
+ goto e_no_asid;
+ sev->asid = asid;
ret = sev_platform_init(&argp->error);
if (ret)
goto e_free;
- sev->active = true;
- sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
+
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
return 0;
e_free:
- sev_asid_free(asid);
+ sev_asid_free(sev);
+ sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
+ sev->active = false;
return ret;
}
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- if (!sev_es)
- return -ENOTTY;
-
- to_kvm_svm(kvm)->sev_info.es_active = true;
-
- return sev_guest_init(kvm, argp);
-}
-
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
- struct sev_data_activate *data;
+ struct sev_data_activate activate;
int asid = sev_get_asid(kvm);
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
/* activate ASID on the given handle */
- data->handle = handle;
- data->asid = asid;
- ret = sev_guest_activate(data, error);
- kfree(data);
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
return ret;
}
@@ -258,7 +312,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_start *start;
+ struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
int *error = &argp->error;
@@ -270,20 +324,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
- if (!start)
- return -ENOMEM;
+ memset(&start, 0, sizeof(start));
dh_blob = NULL;
if (params.dh_uaddr) {
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
- if (IS_ERR(dh_blob)) {
- ret = PTR_ERR(dh_blob);
- goto e_free;
- }
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
- start->dh_cert_address = __sme_set(__pa(dh_blob));
- start->dh_cert_len = params.dh_len;
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
}
session_blob = NULL;
@@ -294,40 +344,40 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_dh;
}
- start->session_address = __sme_set(__pa(session_blob));
- start->session_len = params.session_len;
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
}
- start->handle = params.handle;
- start->policy = params.policy;
+ start.handle = params.handle;
+ start.policy = params.policy;
/* create memory encryption context */
- ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
- ret = sev_bind_asid(kvm, start->handle, error);
- if (ret)
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
goto e_free_session;
+ }
/* return handle to userspace */
- params.handle = start->handle;
+ params.handle = start.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
- sev_unbind_asid(kvm, start->handle);
+ sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
}
- sev->handle = start->handle;
+ sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_blob);
e_free_dh:
kfree(dh_blob);
-e_free:
- kfree(start);
return ret;
}
@@ -417,6 +467,7 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
page_virtual = kmap_atomic(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_atomic(page_virtual);
+ cond_resched();
}
}
@@ -446,7 +497,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
- struct sev_data_launch_update_data *data;
+ struct sev_data_launch_update_data data;
struct page **inpages;
int ret;
@@ -456,20 +507,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
vaddr = params.uaddr;
size = params.len;
vaddr_end = vaddr + size;
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (IS_ERR(inpages)) {
- ret = PTR_ERR(inpages);
- goto e_free;
- }
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -477,6 +522,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
*/
sev_clflush_pages(inpages, npages);
+ data.reserved = 0;
+ data.handle = sev->handle;
+
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
@@ -491,10 +539,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
- data->handle = sev->handle;
- data->len = len;
- data->address = __sme_page_pa(inpages[i]) + offset;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_unpin;
@@ -510,19 +557,25 @@ e_unpin:
}
/* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages);
-e_free:
- kfree(data);
return ret;
}
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
- struct vmcb_save_area *save = &svm->vmcb->save;
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
/* Check some debug related fields before encrypting the VMSA */
- if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
return -EINVAL;
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
/* Sync registgers */
save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
@@ -548,68 +601,71 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xcr0 = svm->vcpu.arch.xcr0;
save->pkru = svm->vcpu.arch.pkru;
save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
+
+ return 0;
+}
+
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
+{
+ struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
/*
- * SEV-ES will use a VMSA that is pointed to by the VMCB, not
- * the traditional VMSA that is part of the VMCB. Copy the
- * traditional VMSA as it has been built so far (in prep
- * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+ * the VMSA memory content (i.e it will write the same memory region
+ * with the guest's key), so invalidate it first.
*/
- memcpy(svm->vmsa, save, sizeof(*save));
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ vcpu->arch.guest_state_protected = true;
return 0;
}
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_update_vmsa *vmsa;
- int i, ret;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
- if (!vmsa)
- return -ENOMEM;
-
- for (i = 0; i < kvm->created_vcpus; i++) {
- struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
-
- /* Perform some pre-encryption checks against the VMSA */
- ret = sev_es_sync_vmsa(svm);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = mutex_lock_killable(&vcpu->mutex);
if (ret)
- goto e_free;
+ return ret;
- /*
- * The LAUNCH_UPDATE_VMSA command will perform in-place
- * encryption of the VMSA memory content (i.e it will write
- * the same memory region with the guest's key), so invalidate
- * it first.
- */
- clflush_cache_range(svm->vmsa, PAGE_SIZE);
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
- vmsa->handle = sev->handle;
- vmsa->address = __sme_pa(svm->vmsa);
- vmsa->len = PAGE_SIZE;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
- &argp->error);
+ mutex_unlock(&vcpu->mutex);
if (ret)
- goto e_free;
-
- svm->vcpu.arch.guest_state_protected = true;
+ return ret;
}
-e_free:
- kfree(vmsa);
- return ret;
+ return 0;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_measure *data;
+ struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
void *blob = NULL;
@@ -621,9 +677,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -631,23 +685,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
+ data.address = __psp_pa(blob);
+ data.len = params.len;
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
@@ -664,63 +715,50 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(measure, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_finish *data;
- int ret;
+ struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
- kfree(data);
- return ret;
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
- struct sev_data_guest_status *data;
+ struct sev_data_guest_status data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
- goto e_free;
+ return ret;
- params.policy = data->policy;
- params.state = data->state;
- params.handle = data->handle;
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
ret = -EFAULT;
-e_free:
- kfree(data);
+
return ret;
}
@@ -729,23 +767,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
int *error, bool enc)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_dbg *data;
- int ret;
+ struct sev_data_dbg data;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- data->dst_addr = dst;
- data->src_addr = src;
- data->len = size;
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
- ret = sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- data, error);
- kfree(data);
- return ret;
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
}
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -765,7 +797,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
}
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user dst_uaddr,
+ void __user *dst_uaddr,
unsigned long dst_paddr,
int size, int *err)
{
@@ -776,7 +808,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL);
+ tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
@@ -789,8 +821,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (tpage) {
offset = paddr & 15;
- if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
- page_address(tpage) + offset, size))
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
ret = -EFAULT;
}
@@ -802,9 +833,9 @@ e_free:
}
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
- unsigned long __user vaddr,
+ void __user *vaddr,
unsigned long dst_paddr,
- unsigned long __user dst_vaddr,
+ void __user *dst_vaddr,
int size, int *error)
{
struct page *src_tpage = NULL;
@@ -812,13 +843,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */
- if (!IS_ALIGNED(vaddr, 16)) {
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL);
if (!src_tpage)
return -ENOMEM;
- if (copy_from_user(page_address(src_tpage),
- (void __user *)(uintptr_t)vaddr, size)) {
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
__free_page(src_tpage);
return -EFAULT;
}
@@ -832,7 +862,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
* - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer
*/
- if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL);
@@ -857,7 +887,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
page_address(src_tpage), size);
else {
if (copy_from_user(page_address(dst_tpage) + dst_offset,
- (void __user *)(uintptr_t)vaddr, size)) {
+ vaddr, size)) {
ret = -EFAULT;
goto e_free;
}
@@ -937,15 +967,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (dec)
ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off,
len, &argp->error);
else
ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
- vaddr,
+ (void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off,
- dst_vaddr,
+ (void __user *)dst_vaddr,
len, &argp->error);
sev_unpin_memory(kvm, src_p, n);
@@ -965,7 +995,7 @@ err:
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_secret *data;
+ struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
@@ -997,41 +1027,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_unpin_memory;
}
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- goto e_unpin_memory;
+ memset(&data, 0, sizeof(data));
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- data->guest_address = __sme_page_pa(pages[0]) + offset;
- data->guest_len = params.guest_len;
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(blob)) {
ret = PTR_ERR(blob);
- goto e_free;
+ goto e_unpin_memory;
}
- data->trans_address = __psp_pa(blob);
- data->trans_len = params.trans_len;
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr)) {
ret = PTR_ERR(hdr);
goto e_free_blob;
}
- data->hdr_address = __psp_pa(hdr);
- data->hdr_len = params.hdr_len;
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
@@ -1046,7 +1071,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *report = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_attestation_report *data;
+ struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
void *blob = NULL;
@@ -1058,9 +1083,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -1068,23 +1091,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
- memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
@@ -1100,22 +1120,710 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(report, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (IS_ERR(guest_page))
+ return PTR_ERR(guest_page);
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len))
+ ret = -EFAULT;
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get parameter from the userspace */
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 1);
+ if (IS_ERR(guest_page)) {
+ ret = PTR_ERR(guest_page);
+ goto e_free_trans;
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
+{
+ /*
+ * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
+ * active mirror VMs. Also allow the debugging and status commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+/* vCPU mutex subclasses. */
+enum sev_migration_role {
+ SEV_MIGRATION_SOURCE = 0,
+ SEV_MIGRATION_TARGET,
+ SEV_NR_MIGRATION_ROLES,
+};
+
+static int sev_lock_vcpus_for_migration(struct kvm *kvm,
+ enum sev_migration_role role)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+ bool first = true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
+ goto out_unlock;
+
+ if (first) {
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = SEV_NR_MIGRATION_ROLES;
+ first = false;
+ } else {
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+ }
+ }
+
+ return 0;
+
+out_unlock:
+
+ first = true;
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+
+
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+
+static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ bool first = true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map,
+ SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
+
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_sev_info *mirror;
+
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
+}
+
+static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
+{
+ unsigned long i;
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ src_svm = to_svm(src_vcpu);
+ dst_vcpu = kvm_get_vcpu(dst, i);
+ dst_svm = to_svm(dst_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate, the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+ to_kvm_svm(src)->sev_info.es_active = false;
+ to_kvm_svm(dst)->sev_info.es_active = true;
+
+ return 0;
+}
+
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto out_fput;
+
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ if (ret)
+ goto out_dst_vcpu;
+
+ if (sev_es_guest(source_kvm)) {
+ ret = sev_es_migrate_from(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+ }
+
+ sev_migrate_from(kvm, source_kvm);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ sev_unlock_vcpus_for_migration(source_kvm);
+out_dst_vcpu:
+ sev_unlock_vcpus_for_migration(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+out_fput:
+ if (source_kvm_file)
+ fput(source_kvm_file);
+ return ret;
+}
+
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
- if (!svm_sev_enabled() || !sev)
+ if (!sev_enabled)
return -ENOTTY;
if (!argp)
@@ -1126,13 +1834,23 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
+ /* Only the enc_context_owner handles some memory enc operations. */
+ if (is_mirroring_enc_context(kvm) &&
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
+ r = -EINVAL;
+ goto out;
+ }
+
switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
- case KVM_SEV_ES_INIT:
- r = sev_es_guest_init(kvm, &sev_cmd);
- break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -1163,6 +1881,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_GET_ATTESTATION_REPORT:
r = sev_get_attestation_report(kvm, &sev_cmd);
break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1176,8 +1915,8 @@ out:
return r;
}
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
@@ -1186,6 +1925,10 @@ int svm_register_enc_region(struct kvm *kvm,
if (!sev_guest(kvm))
return -ENOTTY;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
@@ -1246,12 +1989,16 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
kfree(region);
}
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range)
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
{
struct enc_region *region;
int ret;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
if (!sev_guest(kvm)) {
@@ -1282,6 +2029,70 @@ failed:
return ret;
}
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ struct kvm_sev_info *source_sev, *mirror_sev;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto e_source_fput;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
+
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
+ ret = -EINVAL;
+ goto e_unlock;
+ }
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
+ kvm_get_kvm(source_kvm);
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->active = true;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
+ ret = 0;
+
+ /*
+ * Do not copy ap_jump_table. Since the mirror does not share the same
+ * KVM contexts as the original, and they may have different
+ * memory-views.
+ */
+
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
+ if (source_kvm_file)
+ fput(source_kvm_file);
+ return ret;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1291,7 +2102,18 @@ void sev_vm_destroy(struct kvm *kvm)
if (!sev_guest(kvm))
return;
- mutex_lock(&kvm->lock);
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ struct kvm *owner_kvm = sev->enc_context_owner;
+
+ mutex_lock(&owner_kvm->lock);
+ list_del(&sev->mirror_entry);
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
+ return;
+ }
/*
* Ensure that all guest tagged cache entries are flushed before
@@ -1312,20 +2134,35 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
- sev_asid_free(sev->asid);
+ sev_asid_free(sev);
+}
+
+void __init sev_set_cpu_caps(void)
+{
+ if (!sev_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV);
+ if (!sev_es_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
}
void __init sev_hardware_setup(void)
{
- unsigned int eax, ebx, ecx, edx;
+#ifdef CONFIG_KVM_AMD_SEV
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
- /* Does the CPU support SEV? */
- if (!boot_cpu_has(X86_FEATURE_SEV))
+ if (!sev_enabled || !npt_enabled)
+ goto out;
+
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
goto out;
/* Retrieve SEV CPUID information */
@@ -1336,27 +2173,39 @@ void __init sev_hardware_setup(void)
/* Maximum number of encrypted guests supported simultaneously */
max_sev_asid = ecx;
-
- if (!svm_sev_enabled())
+ if (!max_sev_asid)
goto out;
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
- /* Initialize SEV ASID bitmaps */
- sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
if (!sev_asid_bitmap)
goto out;
- sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
- if (!sev_reclaim_asid_bitmap)
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
goto out;
+ }
- pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
+ goto out;
+
+ pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
sev_supported = true;
/* SEV-ES support requested? */
- if (!sev_es)
+ if (!sev_es_enabled)
goto out;
/* Does the CPU support SEV-ES? */
@@ -1367,74 +2216,91 @@ void __init sev_hardware_setup(void)
if (min_sev_asid == 1)
goto out;
- pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+ sev_es_asid_count = min_sev_asid - 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
+ goto out;
+
+ pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
sev_es_supported = true;
out:
- sev = sev_supported;
- sev_es = sev_es_supported;
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+#endif
}
-void sev_hardware_teardown(void)
+void sev_hardware_unsetup(void)
{
- if (!svm_sev_enabled())
+ if (!sev_enabled)
return;
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(1, max_sev_asid);
+
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
- sev_flush_asids();
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+}
+
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
}
/*
* Pages used by hardware to hold guest encrypted state must be flushed before
* returning them to the system.
*/
-static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
- unsigned long len)
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
+ int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+
/*
- * If hardware enforced cache coherency for encrypted mappings of the
- * same physical page is supported, nothing to do.
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
*/
- if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
- return;
+ unsigned long addr = (unsigned long)va;
/*
- * If the VM Page Flush MSR is supported, use it to flush the page
- * (using the page virtual address and the guest ASID).
+ * If CPU enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
+ * flush is still needed in order to work properly with DMA devices.
*/
- if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
- struct kvm_sev_info *sev;
- unsigned long va_start;
- u64 start, stop;
-
- /* Align start and stop to page boundaries. */
- va_start = (unsigned long)va;
- start = (u64)va_start & PAGE_MASK;
- stop = PAGE_ALIGN((u64)va_start + len);
-
- if (start < stop) {
- sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
- while (start < stop) {
- wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
- start | sev->asid);
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to WBINVD if this faults so as not to make any problems worse
+ * by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_wbinvd;
- start += PAGE_SIZE;
- }
+ return;
- return;
- }
+do_wbinvd:
+ wbinvd_on_all_cpus();
+}
- WARN(1, "Address overflow, using WBINVD\n");
- }
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
- /*
- * Hardware should always have one of the above features,
- * but if not, use WBINVD and issue a warning.
- */
- WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
wbinvd_on_all_cpus();
}
@@ -1448,16 +2314,17 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
if (vcpu->arch.guest_state_protected)
- sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE);
- __free_page(virt_to_page(svm->vmsa));
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
+ __free_page(virt_to_page(svm->sev_es.vmsa));
- if (svm->ghcb_sa_free)
- kfree(svm->ghcb_sa);
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
{
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
unsigned int nbits;
/* Re-use the dump_invalid_vmcb module parameter */
@@ -1483,7 +2350,7 @@ static void dump_ghcb(struct vcpu_svm *svm)
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
/*
* The GHCB protocol so far allows for the following data
@@ -1503,7 +2370,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
u64 exit_code;
/*
@@ -1548,20 +2415,25 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
- u64 exit_code = 0;
+ u64 exit_code;
+ u64 reason;
- ghcb = svm->ghcb;
-
- /* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage)
- goto vmgexit_err;
+ ghcb = svm->sev_es.ghcb;
/*
- * Retrieve the exit code now even though is may not be marked valid
+ * Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
exit_code = ghcb_get_sw_exit_code(ghcb);
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
if (!ghcb_sw_exit_code_is_valid(ghcb) ||
!ghcb_sw_exit_info_1_is_valid(ghcb) ||
!ghcb_sw_exit_info_2_is_valid(ghcb))
@@ -1640,6 +2512,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
break;
default:
+ reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
@@ -1648,53 +2521,58 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
vmgexit_err:
vcpu = &svm->vcpu;
- if (ghcb->ghcb_usage) {
+ if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
} else {
- vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
- return -EINVAL;
+ /* Resume the guest to "return" the error code. */
+ return 1;
}
-static void pre_sev_es_run(struct vcpu_svm *svm)
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
- if (!svm->ghcb)
+ if (!svm->sev_es.ghcb)
return;
- if (svm->ghcb_sa_free) {
+ if (svm->sev_es.ghcb_sa_free) {
/*
* The scratch area lives outside the GHCB, so there is a
* buffer that, depending on the operation performed, may
* need to be synced, then freed.
*/
- if (svm->ghcb_sa_sync) {
+ if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
- ghcb_get_sw_scratch(svm->ghcb),
- svm->ghcb_sa, svm->ghcb_sa_len);
- svm->ghcb_sa_sync = false;
+ ghcb_get_sw_scratch(svm->sev_es.ghcb),
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
}
- kfree(svm->ghcb_sa);
- svm->ghcb_sa = NULL;
- svm->ghcb_sa_free = false;
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
}
- trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb);
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
sev_es_sync_to_ghcb(svm);
- kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true);
- svm->ghcb = NULL;
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+ svm->sev_es.ghcb = NULL;
}
void pre_sev_run(struct vcpu_svm *svm, int cpu)
@@ -1702,9 +2580,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
- /* Perform any SEV-ES pre-run actions */
- pre_sev_es_run(svm);
-
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -1724,10 +2599,10 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
}
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
-static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
u64 ghcb_scratch_beg, ghcb_scratch_end;
u64 scratch_gpa_beg, scratch_gpa_end;
void *scratch_va;
@@ -1735,14 +2610,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
- return false;
+ goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
- return false;
+ goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
@@ -1760,10 +2635,10 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
- return false;
+ goto e_scratch;
}
- scratch_va = (void *)svm->ghcb;
+ scratch_va = (void *)svm->sev_es.ghcb;
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
} else {
/*
@@ -1773,18 +2648,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
- return false;
+ goto e_scratch;
}
- scratch_va = kzalloc(len, GFP_KERNEL);
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- return false;
+ return -ENOMEM;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
- kfree(scratch_va);
- return false;
+ kvfree(scratch_va);
+ return -EFAULT;
}
/*
@@ -1793,14 +2668,20 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
* the vCPU next time (i.e. a read was requested so the data
* must be written back to the guest memory).
*/
- svm->ghcb_sa_sync = sync;
- svm->ghcb_sa_free = true;
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
}
- svm->ghcb_sa = scratch_va;
- svm->ghcb_sa_len = len;
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
- return true;
+ return 0;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return 1;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -1849,9 +2730,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
break;
}
@@ -1887,10 +2768,17 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- fallthrough;
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
default:
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
+ break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
@@ -1899,8 +2787,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
return ret;
}
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
struct ghcb *ghcb;
@@ -1912,21 +2801,25 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
return sev_handle_vmgexit_msr_protocol(svm);
if (!ghcb_gpa) {
- vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
- return -EINVAL;
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
- if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
- vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
- svm->ghcb = svm->ghcb_map.hva;
- ghcb = svm->ghcb_map.hva;
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+ ghcb = svm->sev_es.ghcb_map.hva;
- trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
exit_code = ghcb_get_sw_exit_code(ghcb);
@@ -1938,34 +2831,35 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = -EINVAL;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
- if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
break;
- ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
- svm->ghcb_sa);
+ svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_MMIO_WRITE:
- if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
break;
- ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
- svm->ghcb_sa);
+ svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
- ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+ ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
- struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
switch (control->exit_info_1) {
case 0:
@@ -1979,23 +2873,21 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 1);
- ghcb_set_sw_exit_info_2(ghcb,
- X86_TRAP_UD |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
- vcpu_unimpl(&svm->vcpu,
+ vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
break;
default:
- ret = svm_invoke_exit_handler(svm, exit_code);
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
@@ -2003,11 +2895,23 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
- if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+ int count;
+ int bytes;
+ int r;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
return -EINVAL;
- return kvm_sev_es_string_io(&svm->vcpu, size, port,
- svm->ghcb_sa, svm->ghcb_sa_len, in);
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
}
void sev_es_init_vmcb(struct vcpu_svm *svm)
@@ -2022,7 +2926,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
* VMCB page. Do not include the encryption mask on the VMSA physical
* address since hardware will access it using the guest key.
*/
- svm->vmcb->control.vmsa_pa = __pa(svm->vmsa);
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -2053,36 +2957,40 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
+ (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
+ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
+ if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ }
}
-void sev_es_create_vcpu(struct vcpu_svm *svm)
+void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
/*
- * Set the GHCB MSR value as per the GHCB specification when creating
- * a vCPU for an SEV-ES guest.
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
GHCB_VERSION_MIN,
sev_enc_bit));
}
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- struct vmcb_save_area *hostsa;
-
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
- * of which one step is to perform a VMLOAD. Since hardware does not
- * perform a VMSAVE on VMRUN, the host savearea must be updated.
+ * of which one step is to perform a VMLOAD. KVM performs the
+ * corresponding VMSAVE in svm_prepare_guest_switch for both
+ * traditional and SEV-ES guests.
*/
- vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
- hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- /* PKRU is restored on VMEXIT, save the curent host value */
+ /* PKRU is restored on VMEXIT, save the current host value */
hostsa->pkru = read_pkru();
/* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
@@ -2094,8 +3002,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
struct vcpu_svm *svm = to_svm(vcpu);
/* First SIPI: Use the values as initially set by the VMM */
- if (!svm->received_first_sipi) {
- svm->received_first_sipi = true;
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
return;
}
@@ -2104,5 +3012,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
* the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
* non-zero value.
*/
- ghcb_set_sw_exit_info_2(svm->ghcb, 1);
+ if (!svm->sev_es.ghcb)
+ return;
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 58a45bb139f8..87da90360bc7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -25,6 +25,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -36,6 +37,7 @@
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+#include <asm/fpu/api.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -43,7 +45,8 @@
#include "svm.h"
#include "svm_ops.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -56,26 +59,9 @@ static const struct x86_cpu_id svm_cpu_id[] = {
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_TSC_RATE (1 << 4)
-#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
-#define SVM_FEATURE_FLUSH_ASID (1 << 6)
-#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
-
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
-#define TSC_RATIO_RSVD 0xffffff0000000000ULL
-#define TSC_RATIO_MIN 0x0000000000000001ULL
-#define TSC_RATIO_MAX 0x000000ffffffffffULL
-
static bool erratum_383_found __read_mostly;
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@ -87,7 +73,6 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define TSC_RATIO_DEFAULT 0x0100000000ULL
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
@@ -95,6 +80,8 @@ static const struct svm_direct_access_msrs {
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
{ .index = MSR_FS_BASE, .always = true },
@@ -112,6 +99,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
+ { .index = MSR_TSC_AUX, .always = false },
{ .index = MSR_INVALID, .always = false },
};
@@ -183,20 +171,34 @@ static int vls = true;
module_param(vls, int, 0444);
/* enable/disable Virtual GIF */
-static int vgif = true;
+int vgif = true;
module_param(vgif, int, 0444);
-/* enable/disable SEV support */
-int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev, int, 0444);
+/* enable/disable LBR virtualization */
+static int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
+/*
+ * enable / disable AVIC. Because the defaults differ for APICv
+ * support between VMX and SVM we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
-/* enable/disable SEV-ES support */
-int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev_es, int, 0444);
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
static bool svm_gp_erratum_intercept = true;
static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -214,6 +216,15 @@ struct kvm_ldttss_desc {
DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+static int tsc_aux_uret_slot __read_mostly = -1;
+
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -241,12 +252,12 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-#define MAX_INST_SIZE 15
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
-static int get_max_npt_level(void)
+static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_4LEVEL;
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
@@ -268,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, true);
/* #GP intercept is still needed for vmware backdoor */
if (!enable_vmware_backdoor)
@@ -279,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
- if (!is_smm(&svm->vcpu))
+ if (!is_smm(vcpu))
svm_free_nested(svm);
} else {
@@ -290,7 +301,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return ret;
}
- if (svm_gp_erratum_intercept)
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to workaround the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -327,7 +342,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -363,10 +378,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
- kvm_deliver_exception_payload(&svm->vcpu);
+ kvm_deliver_exception_payload(vcpu);
if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
/*
* For guest debugging where we have to reinject #BP if some
@@ -375,8 +390,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
+ (void)svm_skip_emulated_instruction(vcpu);
+ rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
}
@@ -442,7 +457,7 @@ static int has_svm(void)
return 0;
}
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
return 0;
}
@@ -450,11 +465,24 @@ static int has_svm(void)
return 1;
}
+void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ preempt_disable();
+
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ goto out;
+
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+out:
+ preempt_enable();
+}
+
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -496,8 +524,11 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
- __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
+ /*
+ * Set the default value, even if we don't use TSC scaling
+ * to avoid having stale value in the msr
+ */
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
}
@@ -553,23 +584,19 @@ static void svm_cpu_uninit(int cpu)
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd;
+ int ret = -ENOMEM;
sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
if (!sd)
- return -ENOMEM;
+ return ret;
sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!sd->save_area)
goto free_cpu_data;
- clear_page(page_address(sd->save_area));
-
- if (svm_sev_enabled()) {
- sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
- sizeof(void *),
- GFP_KERNEL);
- if (!sd->sev_vmcbs)
- goto free_save_area;
- }
+
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
per_cpu(svm_data, cpu) = sd;
@@ -579,7 +606,7 @@ free_save_area:
__free_page(sd->save_area);
free_cpu_data:
kfree(sd);
- return -ENOMEM;
+ return ret;
}
@@ -642,6 +669,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
u32 msr, int read, int write)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
@@ -670,6 +698,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
msrpm[offset] = tmp;
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+ svm->nested.force_msr_bitmap_recalc = true;
}
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -681,14 +712,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
u32 *svm_vcpu_alloc_msrpm(void)
{
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
u32 *msrpm;
if (!pages)
return NULL;
msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
return msrpm;
}
@@ -707,7 +739,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
void svm_vcpu_free_msrpm(u32 *msrpm)
{
- __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -772,6 +804,17 @@ static void init_msrpm_offsets(void)
}
}
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -781,6 +824,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}
static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@@ -792,6 +839,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+ * on nested guest entries.
+ */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+ /*
+ * If the LBR virtualization is disabled, the LBR msrs are always
+ * kept in the vmcb01 to avoid copying them on nested guest entries.
+ *
+ * If nested, and the LBR virtualization is enabled/disabled, the msrs
+ * are moved between the vmcb01 and vmcb02 as needed.
+ */
+ struct vmcb *vmcb =
+ (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+ svm->vmcb : svm->vmcb01.ptr;
+
+ switch (index) {
+ case MSR_IA32_DEBUGCTLMSR:
+ return vmcb->save.dbgctl;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ return vmcb->save.br_from;
+ case MSR_IA32_LASTBRANCHTOIP:
+ return vmcb->save.br_to;
+ case MSR_IA32_LASTINTFROMIP:
+ return vmcb->save.last_excp_from;
+ case MSR_IA32_LASTINTTOIP:
+ return vmcb->save.last_excp_to;
+ default:
+ KVM_BUG(false, svm->vcpu.kvm,
+ "%s: Unknown MSR 0x%x", __func__, index);
+ return 0;
+ }
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+ DEBUGCTLMSR_LBR;
+
+ bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+ LBR_CTL_ENABLE_MASK);
+
+ if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+ if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+ enable_lbrv = true;
+
+ if (enable_lbrv == current_enable_lbrv)
+ return;
+
+ if (enable_lbrv)
+ svm_enable_lbrv(vcpu);
+ else
+ svm_disable_lbrv(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -813,6 +921,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
control->pause_filter_count = __grow_ple_window(old,
pause_filter_count,
pause_filter_count_grow,
@@ -831,6 +942,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
control->pause_filter_count =
__shrink_ple_window(old,
pause_filter_count,
@@ -843,222 +957,20 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-/*
- * The default MMIO mask is a single bit (excluding the present bit),
- * which could conflict with the memory encryption bit. Check for
- * memory encryption support and override the default MMIO mask if
- * memory encryption is enabled.
- */
-static __init void svm_adjust_mmio_mask(void)
-{
- unsigned int enc_bit, mask_bit;
- u64 msr, mask;
-
- /* If there is no memory encryption support, use existing mask */
- if (cpuid_eax(0x80000000) < 0x8000001f)
- return;
-
- /* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_K8_SYSCFG, msr);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
- return;
-
- enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
- mask_bit = boot_cpu_data.x86_phys_bits;
-
- /* Increment the mask bit if it is the same as the encryption bit */
- if (enc_bit == mask_bit)
- mask_bit++;
-
- /*
- * If the mask bit location is below 52, then some bits above the
- * physical addressing limit will always be reserved, so use the
- * rsvd_bits() function to generate the mask. This mask, along with
- * the present bit, will be used to generate a page fault with
- * PFER.RSV = 1.
- *
- * If the mask bit location is 52 (or above), then clear the mask.
- */
- mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
-}
-
-static void svm_hardware_teardown(void)
+static void svm_hardware_unsetup(void)
{
int cpu;
- if (svm_sev_enabled())
- sev_hardware_teardown();
+ sev_hardware_unsetup();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
iopm_base = 0;
}
-static __init void svm_set_cpu_caps(void)
-{
- kvm_set_cpu_caps();
-
- supported_xss = 0;
-
- /* CPUID 0x80000001 and 0x8000000A (SVM features) */
- if (nested) {
- kvm_cpu_cap_set(X86_FEATURE_SVM);
-
- if (nrips)
- kvm_cpu_cap_set(X86_FEATURE_NRIPS);
-
- if (npt_enabled)
- kvm_cpu_cap_set(X86_FEATURE_NPT);
-
- /* Nested VM can receive #VMEXIT instead of triggering #GP */
- kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
- }
-
- /* CPUID 0x80000008 */
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
- boot_cpu_has(X86_FEATURE_AMD_SSBD))
- kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
-
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- init_msrpm_offsets();
-
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
- }
-
- /* Check for pause filtering support */
- if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- pause_filter_count = 0;
- pause_filter_thresh = 0;
- } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
- pause_filter_thresh = 0;
- }
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
- }
-
- if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) {
- sev_hardware_setup();
- } else {
- sev = false;
- sev_es = false;
- }
-
- svm_adjust_mmio_mask();
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- /*
- * KVM's MMU doesn't support using 2-level paging for itself, and thus
- * NPT isn't supported if the host is using 2-level paging since host
- * CR4 is unchanged on VMRUN.
- */
- if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
- npt_enabled = false;
-
- if (!boot_cpu_has(X86_FEATURE_NPT))
- npt_enabled = false;
-
- kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
-
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
- if (avic) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
- avic = false;
- } else {
- pr_info("AVIC enabled\n");
-
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- }
- }
-
- if (vls) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
- !IS_ENABLED(CONFIG_X86_64)) {
- vls = false;
- } else {
- pr_info("Virtual VMLOAD VMSAVE supported\n");
- }
- }
-
- if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
- svm_gp_erratum_intercept = false;
-
- if (vgif) {
- if (!boot_cpu_has(X86_FEATURE_VGIF))
- vgif = false;
- else
- pr_info("Virtual GIF supported\n");
- }
-
- svm_set_cpu_caps();
-
- /*
- * It seems that on AMD processors PTE's accessed bit is
- * being set by the CPU hardware before the NPF vmexit.
- * This is not expected behaviour and our tests fail because
- * of it.
- * A workaround here is to disable support for
- * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
- * In this case userspace can know if there is support using
- * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
- * it
- * If future AMD CPU models change the behaviour described above,
- * this variable can be changed accordingly
- */
- allow_smaller_maxphyaddr = !npt_enabled;
-
- return 0;
-
-err:
- svm_hardware_teardown();
- return r;
-}
-
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
@@ -1076,29 +988,38 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u64 g_tsc_offset = 0;
- if (is_guest_mode(vcpu)) {
- /* Write L1's TSC offset. */
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
- }
+ return svm->nested.ctl.tsc_offset;
+}
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- svm->vmcb->control.tsc_offset - g_tsc_offset,
- offset);
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+ return svm->tsc_ratio_msr;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
- return svm->vmcb->control.tsc_offset;
}
-static void svm_check_invpcid(struct vcpu_svm *svm)
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
+ __svm_write_tsc_multiplier(multiplier);
+}
+
+
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
{
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
@@ -1111,14 +1032,55 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
}
-static void init_vmcb(struct vcpu_svm *svm)
+static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vcpu_svm *svm = to_svm(vcpu);
- svm->vcpu.arch.hflags = 0;
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+ svm->v_vmload_vmsave_enabled = false;
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
+}
+
+static void init_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1088,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
@@ -1140,14 +1102,18 @@ static void init_vmcb(struct vcpu_svm *svm)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does.
+ * as VMware does. Don't intercept #GP for SEV guests as KVM can't
+ * decrypt guest memory to decode the faulting instruction.
*/
- if (enable_vmware_backdoor)
+ if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
svm_set_intercept(svm, INTERCEPT_NMI);
- svm_set_intercept(svm, INTERCEPT_SMI);
+
+ if (intercept_smi)
+ svm_set_intercept(svm, INTERCEPT_SMI);
+
svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
svm_set_intercept(svm, INTERCEPT_RDPMC);
svm_set_intercept(svm, INTERCEPT_CPUID);
@@ -1170,12 +1136,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_RDPRU);
svm_set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
svm_set_intercept(svm, INTERCEPT_MONITOR);
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
@@ -1195,29 +1161,14 @@ static void init_vmcb(struct vcpu_svm *svm)
SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;
+ save->gdtr.base = 0;
save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
save->idtr.limit = 0xffff;
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(&svm->vcpu, 0);
- svm_set_efer(&svm->vcpu, 0);
- save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
- save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
- * It also updates the guest-visible cr0 value.
- */
- svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(&svm->vcpu);
-
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
@@ -1225,17 +1176,16 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = svm->vcpu.arch.pat;
+ save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
- save->cr4 = 0;
}
- svm->asid_generation = 0;
+ svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
- svm->nested.vmcb12_gpa = 0;
- svm->vcpu.arch.hflags = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
- if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
@@ -1244,20 +1194,17 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_check_invpcid(svm);
-
- if (kvm_vcpu_apicv_active(&svm->vcpu))
- avic_init_vmcb(svm);
+ svm_recalc_instruction_intercepts(vcpu, svm);
/*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
*/
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm, vmcb);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,50 +1212,61 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
- if (sev_guest(svm->vcpu.kvm)) {
+ if (sev_guest(vcpu->kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/* Perform SEV-ES specific VMCB updates */
sev_es_init_vmcb(svm);
}
}
- vmcb_mark_all_dirty(svm->vmcb);
+ svm_hv_init_vmcb(vmcb);
+ init_vmcb_after_set_cpuid(vcpu);
+
+ vmcb_mark_all_dirty(vmcb);
enable_gif(svm);
+}
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_vcpu_reset(svm);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 dummy;
- u32 eax = 1;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
- if (!init_event) {
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
- }
- init_vmcb(svm);
+ init_vmcb(vcpu);
- kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
- kvm_rdx_write(vcpu, eax);
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
+}
- if (kvm_vcpu_apicv_active(vcpu) && !init_event)
- avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
}
-static int svm_create_vcpu(struct kvm_vcpu *vcpu)
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *vmcb_page;
+ struct page *vmcb01_page;
struct page *vmsa_page = NULL;
int err;
@@ -1316,11 +1274,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vmcb_page)
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
goto out;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
@@ -1332,46 +1290,30 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
/*
* SEV-ES guests maintain an encrypted version of their FPU
* state which is restored and saved on VMRUN and VMEXIT.
- * Free the fpu structure to prevent KVM from attempting to
- * access the FPU state.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
*/
- kvm_free_guest_fpu(vcpu);
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
if (err)
goto error_free_vmsa_page;
- /* We initialize this flag to true to make sure that the is_running
- * bit would be set the first time the vcpu is loaded.
- */
- if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
- svm->avic_is_running = true;
-
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) {
err = -ENOMEM;
goto error_free_vmsa_page;
}
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
- svm->vmcb = page_address(vmcb_page);
- svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
if (vmsa_page)
- svm->vmsa = page_address(vmsa_page);
+ svm->sev_es.vmsa = page_address(vmsa_page);
- svm->asid_generation = 0;
svm->guest_state_loaded = false;
- init_vmcb(svm);
-
- svm_init_osvw(vcpu);
- vcpu->arch.microcode_version = 0x01000065;
-
- if (sev_es_guest(svm->vcpu.kvm))
- /* Perform SEV-ES specific VMCB creation updates */
- sev_es_create_vcpu(svm);
return 0;
@@ -1379,7 +1321,7 @@ error_free_vmsa_page:
if (vmsa_page)
__free_page(vmsa_page);
error_free_vmcb_page:
- __free_page(vmcb_page);
+ __free_page(vmcb01_page);
out:
return err;
}
@@ -1392,7 +1334,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1407,68 +1349,45 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- unsigned int i;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
if (svm->guest_state_loaded)
return;
/*
- * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
- * area (non-sev-es). Save ones that aren't so we can restore them
- * individually later.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- /*
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
- if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_prepare_guest_switch(svm, vcpu->cpu);
- } else {
- vmsave(__sme_page_pa(sd->save_area));
- }
+ vmsave(__sme_page_pa(sd->save_area));
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *hostsa;
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
- if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
- }
+ sev_es_prepare_switch_to_guest(hostsa);
}
- /* This assumes that the kernel never uses MSR_TSC_AUX */
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+
+ if (likely(tsc_aux_uret_slot >= 0))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
}
static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- unsigned int i;
-
- if (!svm->guest_state_loaded)
- return;
-
- /*
- * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
- * area (non-sev-es). Restore the ones that weren't.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- svm->guest_state_loaded = false;
+ to_svm(vcpu)->guest_state_loaded = false;
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1476,21 +1395,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
-
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
}
- avic_vcpu_load(vcpu, cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
@@ -1524,15 +1441,30 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_EXREG_PDPTR:
- BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
}
}
@@ -1540,8 +1472,11 @@ static void svm_set_vintr(struct vcpu_svm *svm)
{
struct vmcb_control_area *control;
- /* The following fields are ignored when AVIC is enabled */
- WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+ /*
+ * The following fields are ignored when AVIC is enabled
+ */
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
+
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
@@ -1558,17 +1493,20 @@ static void svm_set_vintr(struct vcpu_svm *svm)
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
svm_clr_intercept(svm, INTERCEPT_VINTR);
/* Drop int_ctl fields related to VINTR injection. */
- svm->vmcb->control.int_ctl &= mask;
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
if (is_guest_mode(&svm->vcpu)) {
- svm->nested.hsave->control.int_ctl &= mask;
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
- svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
}
vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
@@ -1577,16 +1515,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
}
BUG();
return NULL;
@@ -1675,6 +1614,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1709,37 +1657,29 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void update_cr0_intercept(struct vcpu_svm *svm)
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- ulong gcr0;
- u64 *hcr0;
+ struct vcpu_svm *svm = to_svm(vcpu);
/*
- * SEV-ES guests must always keep the CR intercepts cleared. CR
- * tracking is done using the CR write traps.
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
*/
- if (sev_es_guest(svm->vcpu.kvm))
- return;
-
- gcr0 = svm->vcpu.arch.cr0;
- hcr0 = &svm->vmcb->save.cr0;
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
- if (gcr0 == *hcr0) {
- svm_clr_intercept(svm, INTERCEPT_CR0_READ);
- svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
- } else {
- svm_set_intercept(svm, INTERCEPT_CR0_READ);
- svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
}
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1756,8 +1696,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
#endif
vcpu->arch.cr0 = cr0;
- if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (!npt_enabled) {
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/*
* re-enable caching here because the QEMU bios
@@ -1765,10 +1708,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
- update_cr0_intercept(svm);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
}
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1782,11 +1741,15 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = vcpu->arch.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb(vcpu);
+ svm_flush_tlb_current(vcpu);
vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
+ if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -1847,7 +1810,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
- svm->asid_generation = sd->asid_generation;
+ svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
}
@@ -1896,39 +1859,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
{
- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
- return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
{
- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
- if (!(svm->vcpu.guest_debug &
+ if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
- kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1938,7 +1905,7 @@ static int db_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- if (svm->vcpu.guest_debug &
+ if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1919,10 @@ static int db_interception(struct vcpu_svm *svm)
return 1;
}
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1930,14 @@ static int bp_interception(struct vcpu_svm *svm)
return 0;
}
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
{
- return handle_ud(&svm->vcpu);
+ return handle_ud(vcpu);
}
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
return 1;
}
@@ -2012,7 +1980,7 @@ static bool is_erratum_383(void)
return true;
}
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
if (is_erratum_383()) {
/*
@@ -2021,7 +1989,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2033,41 +2001,46 @@ static void svm_handle_mce(struct vcpu_svm *svm)
kvm_machine_check();
}
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
/*
* The VM save area has already been encrypted so it
* cannot be reinitialized - just terminate.
*/
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
return -EINVAL;
/*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortuately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. At a platform view, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ kvm_vcpu_reset(vcpu, true);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
- ++svm->vcpu.stat.io_exits;
+ ++vcpu->stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
@@ -2082,93 +2055,75 @@ static int io_interception(struct vcpu_svm *svm)
svm->next_rip = svm->vmcb->control.exit_info_2;
- return kvm_fast_pio(&svm->vcpu, size, port, in);
+ return kvm_fast_pio(vcpu, size, port, in);
}
-static int nmi_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int intr_interception(struct vcpu_svm *svm)
+static int smi_interception(struct kvm_vcpu *vcpu)
{
- ++svm->vcpu.stat.irq_exits;
return 1;
}
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
{
+ ++vcpu->stat.irq_exits;
return 1;
}
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
struct kvm_host_map map;
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
if (ret) {
if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (vmload) {
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
+
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
{
- struct vmcb *nested_vmcb;
- struct kvm_host_map map;
- int ret;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
- if (ret) {
- if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- nested_vmcb = map.hva;
-
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ return vmload_vmsave_interception(vcpu, true);
+}
- return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
}
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
{
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- return nested_svm_vmrun(svm);
+ return nested_svm_vmrun(vcpu);
}
enum {
@@ -2207,7 +2162,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
};
- int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_INSTR_VMRUN] = vmrun_interception,
[SVM_INSTR_VMLOAD] = vmload_interception,
[SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2171,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
int ret;
if (is_guest_mode(vcpu)) {
- svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
/* Returns '1' or -errno on failure, '0' on success. */
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
if (ret)
return ret;
return 1;
}
- return svm_instr_handlers[opcode](svm);
+ return svm_instr_handlers[opcode](vcpu);
}
/*
@@ -2237,9 +2188,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
* regions (e.g. SMM memory on host).
* 2) VMware backdoor
*/
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 error_code = svm->vmcb->control.exit_info_1;
int opcode;
@@ -2264,8 +2215,13 @@ static int gp_interception(struct vcpu_svm *svm)
if (!is_guest_mode(vcpu))
return kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
- } else
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
return emulate_svm_instr(vcpu, opcode);
+ }
reinject:
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -2281,7 +2237,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* Likewise, clear the VINTR intercept, we will set it
* again while processing KVM_REQ_EVENT if needed.
*/
- if (vgif_enabled(svm))
+ if (vgif)
svm_clr_intercept(svm, INTERCEPT_STGI);
if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
@@ -2299,78 +2255,63 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* in use, we still rely on the VINTR intercept (rather than
* STGI) to detect an open interrupt window.
*/
- if (!vgif_enabled(svm))
+ if (!vgif)
svm_clear_vintr(svm);
}
}
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, true);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
return ret;
}
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, false);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
return ret;
}
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
- kvm_rax_read(&svm->vcpu));
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
- /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wbinvd(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_rcx_read(&svm->vcpu);
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
- int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
- return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u16 tss_selector;
int reason;
int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2340,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
if (reason == TASK_SWITCH_GATE) {
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
+ vcpu->arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2349,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
error_code =
(u32)svm->vmcb->control.exit_info_2;
}
- kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(vcpu);
break;
default:
break;
@@ -2422,78 +2363,59 @@ static int task_switch_interception(struct vcpu_svm *svm)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(&svm->vcpu))
+ if (!svm_skip_emulated_instruction(vcpu))
return 0;
}
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
has_error_code, error_code);
}
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_cpuid(&svm->vcpu);
-}
+ struct vcpu_svm *svm = to_svm(vcpu);
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ ++vcpu->stat.nmi_window_exits;
+ vcpu->arch.hflags |= HF_IRET_MASK;
+ if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
}
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
-static int invd_interception(struct vcpu_svm *svm)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0);
-
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ return kvm_emulate_instruction(vcpu, 0);
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_instruction(&svm->vcpu, 0);
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+ return kvm_emulate_instruction(vcpu, 0);
}
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
{
- int err;
-
- if (!nrips)
- return emulate_on_interception(svm);
-
- err = kvm_rdpmc(&svm->vcpu);
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
unsigned long val)
{
- unsigned long cr0 = svm->vcpu.arch.cr0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
bool ret = false;
- if (!is_guest_mode(&svm->vcpu) ||
- (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ if (!is_guest_mode(vcpu) ||
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -2509,17 +2431,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
#define CR_VALID (1ULL << 63)
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, cr;
unsigned long val;
int err;
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2453,61 @@ static int cr_interception(struct vcpu_svm *svm)
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- if (!check_selective_cr0_intercepted(svm, val))
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
else
return 1;
break;
case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
+ err = kvm_set_cr3(vcpu, val);
break;
case 4:
- err = kvm_set_cr4(&svm->vcpu, val);
+ err = kvm_set_cr4(vcpu, val);
break;
case 8:
- err = kvm_set_cr8(&svm->vcpu, val);
+ err = kvm_set_cr8(vcpu, val);
break;
default:
WARN(1, "unhandled write to CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
} else { /* mov from cr */
switch (cr) {
case 0:
- val = kvm_read_cr0(&svm->vcpu);
+ val = kvm_read_cr0(vcpu);
break;
case 2:
- val = svm->vcpu.arch.cr2;
+ val = vcpu->arch.cr2;
break;
case 3:
- val = kvm_read_cr3(&svm->vcpu);
+ val = kvm_read_cr3(vcpu);
break;
case 4:
- val = kvm_read_cr4(&svm->vcpu);
+ val = kvm_read_cr4(vcpu);
break;
case 8:
- val = kvm_get_cr8(&svm->vcpu);
+ val = kvm_get_cr8(vcpu);
break;
default:
WARN(1, "unhandled read from CR%d", cr);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long old_value, new_value;
unsigned int cr;
int ret = 0;
@@ -2606,7 +2529,7 @@ static int cr_trap(struct vcpu_svm *svm)
kvm_post_set_cr4(vcpu, old_value, new_value);
break;
case 8:
- ret = kvm_set_cr8(&svm->vcpu, new_value);
+ ret = kvm_set_cr8(vcpu, new_value);
break;
default:
WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2540,57 @@ static int cr_trap(struct vcpu_svm *svm)
return kvm_complete_insn_gp(vcpu, ret);
}
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
unsigned long val;
int err = 0;
- if (svm->vcpu.guest_debug == 0) {
+ if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
* and reenter on this instruction. The next vmexit will
* retrieve the full state of the debug registers.
*/
clr_dr_intercepts(svm);
- svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
return 1;
}
if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
- err = kvm_set_dr(&svm->vcpu, dr, val);
+ val = kvm_register_read(vcpu, reg);
+ err = kvm_set_dr(vcpu, dr, val);
} else {
- kvm_get_dr(&svm->vcpu, dr, &val);
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
}
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ return kvm_complete_insn_gp(vcpu, err);
}
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
int r;
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ u8 cr8_prev = kvm_get_cr8(vcpu);
/* instruction emulation calls kvm_set_cr8() */
- r = cr_interception(svm);
- if (lapic_in_kernel(&svm->vcpu))
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
return r;
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ if (cr8_prev <= kvm_get_cr8(vcpu))
return r;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
{
struct msr_data msr_info;
int ret;
@@ -2680,10 +2603,10 @@ static int efer_trap(struct vcpu_svm *svm)
*/
msr_info.host_initiated = false;
msr_info.index = MSR_EFER;
- msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
- ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
- return kvm_complete_insn_gp(&svm->vcpu, ret);
+ return kvm_complete_insn_gp(vcpu, ret);
}
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2709,56 +2632,50 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct vcpu_svm *svm = to_svm(vcpu);
switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
case MSR_STAR:
- msr_info->data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb01.ptr->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- msr_info->data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
break;
case MSR_CSTAR:
- msr_info->data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- msr_info->data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- msr_info->data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- msr_info->data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- msr_info->data = svm->sysenter_eip;
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
- msr_info->data = svm->sysenter_esp;
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
msr_info->data = svm->tsc_aux;
break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm->vmcb->save.dbgctl;
- break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm->vmcb->save.br_from;
- break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm->vmcb->save.br_to;
- break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm->vmcb->save.last_excp_from;
- break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -2771,7 +2688,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_has_spec_ctrl_msr(vcpu))
return 1;
- msr_info->data = svm->spec_ctrl;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr_info->host_initiated &&
@@ -2809,22 +2729,17 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (!sev_es_guest(svm->vcpu.kvm) || !err)
- return kvm_complete_insn_gp(&svm->vcpu, err);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
- ghcb_set_sw_exit_info_1(svm->ghcb, 1);
- ghcb_set_sw_exit_info_2(svm->ghcb,
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
X86_TRAP_GP |
SVM_EVTINJ_TYPE_EXEPT |
SVM_EVTINJ_VALID);
return 1;
}
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2853,15 +2768,46 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int r;
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+
+ if (!svm->tsc_scaling_enabled) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to bug in qemu 6.2.0, it would try to set
+ * this msr to 0 if tsc scaling is not enabled.
+ * Ignore this value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
case MSR_IA32_CR_PAT:
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
vcpu->arch.pat = data;
- svm->vmcb->save.g_pat = data;
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2818,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (kvm_spec_ctrl_test_value(data))
return 1;
- svm->spec_ctrl = data;
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
if (!data)
break;
@@ -2915,47 +2864,56 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->virt_spec_ctrl = data;
break;
case MSR_STAR:
- svm->vmcb->save.star = data;
+ svm->vmcb01.ptr->save.star = data;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
+ svm->vmcb01.ptr->save.lstar = data;
break;
case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
+ svm->vmcb01.ptr->save.cstar = data;
break;
case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
+ svm->vmcb01.ptr->save.sfmask = data;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
+ svm->vmcb01.ptr->save.sysenter_cs = data;
break;
case MSR_IA32_SYSENTER_EIP:
- svm->sysenter_eip = data;
- svm->vmcb->save.sysenter_eip = data;
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+ * when we spoof an Intel vendor ID (for cross vendor migration).
+ * In this case we use this intercept to track the high
+ * 32 bit part of these msrs to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
- svm->sysenter_esp = data;
- svm->vmcb->save.sysenter_esp = data;
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
- if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- return 1;
-
/*
- * This is rare, so we update the MSR here instead of using
- * direct_access_msrs. Doing that would require a rdmsr in
- * svm_vcpu_put.
+ * TSC_AUX is usually changed only during boot and never read
+ * directly. Intercept TSC_AUX instead of exposing it to the
+ * guest via direct_access_msrs, and switch it via user return.
*/
+ preempt_disable();
+ r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ preempt_enable();
+ if (r)
+ return 1;
+
svm->tsc_aux = data;
- wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ if (!lbrv) {
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -2963,15 +2921,25 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm->vmcb->save.dbgctl = data;
- vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
- if (data & (1ULL<<0))
- svm_enable_lbrv(vcpu);
+ if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+ svm->vmcb->save.dbgctl = data;
else
- svm_disable_lbrv(vcpu);
+ svm->vmcb01.ptr->save.dbgctl = data;
+
+ svm_update_lbrv(vcpu);
+
break;
case MSR_VM_HSAVE_PA:
- svm->nested.hsave_msr = data;
+ /*
+ * Old kernels did not validate the value written to
+ * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
+ * value to allow live migrating buggy or malicious guests
+ * originating from those kernels.
+ */
+ if (!msr->host_initiated && !page_address_valid(vcpu, data))
+ return 1;
+
+ svm->nested.hsave_msr = data & PAGE_MASK;
break;
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
@@ -2996,84 +2964,62 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->msr_decfg = data;
break;
}
- case MSR_IA32_APICBASE:
- if (kvm_vcpu_apicv_active(vcpu))
- avic_update_vapic_bar(to_svm(vcpu), data);
- fallthrough;
default:
return kvm_set_msr_common(vcpu, msr);
}
return 0;
}
-static int wrmsr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
-{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm);
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
else
- return rdmsr_interception(svm);
+ return kvm_emulate_rdmsr(vcpu);
}
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- svm_clear_vintr(svm);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
/*
- * For AVIC, the only reason to end up here is ExtINTs.
+ * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
+ *
+ * If running nested, still remove the VM wide AVIC inhibit to
+ * support case in which the interrupt window was requested when the
+ * vCPU was not running nested.
+
+ * All vCPUs which run still run nested, will remain to have their
+ * AVIC still inhibited due to per-cpu AVIC inhibition.
*/
- svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
- ++svm->vcpu.stat.irq_window_exits;
+ ++vcpu->stat.irq_window_exits;
return 1;
}
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel;
-
/*
* CPL is not made available for an SEV-ES guest, therefore
* vcpu->arch.preempted_in_kernel can never be true. Just
* set in_kernel to false as well.
*/
- in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
- if (!kvm_pause_in_guest(vcpu->kvm))
- grow_ple_window(vcpu);
+ grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
- return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
- return kvm_skip_emulated_instruction(&(svm->vcpu));
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
unsigned long type;
gva_t gva;
@@ -3090,15 +3036,10 @@ static int invpcid_interception(struct vcpu_svm *svm)
type = svm->vmcb->control.exit_info_2;
gva = svm->vmcb->control.exit_info_1;
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
return kvm_handle_invpcid(vcpu, type, gva);
}
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_READ_CR4] = cr_interception,
@@ -3133,15 +3074,14 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_SMI] = smi_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
- [SVM_EXIT_RDPMC] = rdpmc_interception,
- [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
[SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = invd_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
[SVM_EXIT_PAUSE] = pause_interception,
- [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
[SVM_EXIT_IOIO] = io_interception,
@@ -3149,17 +3089,18 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
[SVM_EXIT_VMLOAD] = vmload_interception,
[SVM_EXIT_VMSAVE] = vmsave_interception,
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = wbinvd_interception,
- [SVM_EXIT_MONITOR] = monitor_interception,
- [SVM_EXIT_MWAIT] = mwait_interception,
- [SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
[SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
[SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
@@ -3177,12 +3118,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
+ pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+ svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3239,30 +3183,30 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
save->ds.limit, save->ds.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"fs:",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gs:",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"gdtr:",
save->gdtr.selector, save->gdtr.attrib,
save->gdtr.limit, save->gdtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"ldtr:",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"idtr:",
save->idtr.selector, save->idtr.attrib,
save->idtr.limit, save->idtr.base);
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"tr:",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
- pr_err("cpl: %d efer: %016llx\n",
- save->cpl, save->efer);
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
+ pr_err("vmpl: %d cpl: %d efer: %016llx\n",
+ save->vmpl, save->cpl, save->efer);
pr_err("%-15s %016llx %-13s %016llx\n",
"cr0:", save->cr0, "cr2:", save->cr2);
pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3218,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"rsp:", save->rsp, "rax:", save->rax);
pr_err("%-15s %016llx %-13s %016llx\n",
- "star:", save->star, "lstar:", save->lstar);
+ "star:", save01->star, "lstar:", save01->lstar);
pr_err("%-15s %016llx %-13s %016llx\n",
- "cstar:", save->cstar, "sfmask:", save->sfmask);
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
pr_err("%-15s %016llx %-13s %016llx\n",
- "kernel_gs_base:", save->kernel_gs_base,
- "sysenter_cs:", save->sysenter_cs);
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
pr_err("%-15s %016llx %-13s %016llx\n",
- "sysenter_esp:", save->sysenter_esp,
- "sysenter_eip:", save->sysenter_eip);
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
pr_err("%-15s %016llx %-13s %016llx\n",
"gpat:", save->g_pat, "dbgctl:", save->dbgctl);
pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3292,12 +3236,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(u64 exit_code)
{
- if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
- svm_exit_handlers[exit_code])
- return 0;
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
dump_vmcb(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3305,35 +3251,36 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_code;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-
- return -EINVAL;
+ return 0;
}
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
- return 0;
+ if (!svm_check_exit_valid(exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
- return msr_interception(svm);
+ return msr_interception(vcpu);
else if (exit_code == SVM_EXIT_VINTR)
- return interrupt_window_interception(svm);
+ return interrupt_window_interception(vcpu);
else if (exit_code == SVM_EXIT_INTR)
- return intr_interception(svm);
+ return intr_interception(vcpu);
else if (exit_code == SVM_EXIT_HLT)
- return halt_interception(svm);
+ return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
- return npf_interception(svm);
+ return npf_interception(vcpu);
#endif
- return svm_exit_handlers[exit_code](svm);
+ return svm_exit_handlers[exit_code](vcpu);
}
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+ *reason = control->exit_code;
*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
*intr_info = control->exit_int_info;
@@ -3344,13 +3291,13 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
*error_code = 0;
}
-static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
/* SEV-ES guests must use the CR write traps to track CR registers. */
if (!sev_es_guest(vcpu->kvm)) {
@@ -3363,7 +3310,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (is_guest_mode(vcpu)) {
int vmexit;
- trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
@@ -3395,7 +3342,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- return svm_invoke_exit_handler(svm, exit_code);
+ return svm_invoke_exit_handler(vcpu, exit_code);
}
static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3353,27 @@ static void reload_tss(struct kvm_vcpu *vcpu)
load_TR_desc();
}
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
- if (sev_guest(svm->vcpu.kvm))
- return pre_sev_run(svm, svm->vcpu.cpu);
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
/* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != sd->asid_generation)
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
}
@@ -3424,12 +3383,12 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
-static void svm_set_irq(struct kvm_vcpu *vcpu)
+static void svm_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3442,6 +3401,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
+{
+ /*
+ * vcpu->arch.apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
+
+ if (!READ_ONCE(vcpu->arch.apicv_active)) {
+ /* Process the interrupt via inject_pending_event */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3478,7 +3486,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
return false;
ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (svm->vcpu.arch.hflags & HF_NMI_MASK);
+ (vcpu->arch.hflags & HF_NMI_MASK);
return ret;
}
@@ -3489,18 +3497,18 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return -EBUSY;
-
- return !svm_nmi_blocked(vcpu);
+ return 1;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ return !!(vcpu->arch.hflags & HF_NMI_MASK);
}
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3516,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
struct vcpu_svm *svm = to_svm(vcpu);
if (masked) {
- svm->vcpu.arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
} else {
- svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- if (!sev_es_guest(svm->vcpu.kvm))
+ vcpu->arch.hflags &= ~HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3526,17 +3534,10 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(svm->vcpu.kvm)) {
- /*
- * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
- * bit to determine the state of the IF flag.
- */
- if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
- return true;
- } else if (is_guest_mode(vcpu)) {
+ if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
- ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
: !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
return true;
@@ -3544,7 +3545,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (nested_exit_on_intr(svm))
return false;
} else {
- if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ if (!svm_get_if_flag(vcpu))
return true;
}
@@ -3554,9 +3555,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
+
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
/*
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
* e.g. if the IRQ arrived asynchronously after checking nested events.
@@ -3564,7 +3569,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
return -EBUSY;
- return !svm_interrupt_blocked(vcpu);
+ return 1;
}
static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3579,14 +3584,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* enabled, the STGI interception will not occur. Enable the irq
* window under the assumption that the hardware will set the GIF.
*/
- if (vgif_enabled(svm) || gif_set(svm)) {
+ if (vgif || gif_set(svm)) {
/*
* IRQ window is not needed when AVIC is enabled,
* unless we have pending ExtINT since it cannot be injected
- * via AVIC. In such case, we need to temporarily disable AVIC,
+ * via AVIC. In such case, KVM needs to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
*/
- svm_toggle_avic_for_irq_window(vcpu, false);
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
svm_set_vintr(svm);
}
}
@@ -3595,12 +3606,11 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
+ if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
}
@@ -3614,17 +3624,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
- return 0;
-}
-
-void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3638,7 +3638,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
else
- svm->asid_generation--;
+ svm->current_vmcb->asid_generation--;
}
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3675,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3689,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
- (sev_es_guest(svm->vcpu.kvm) ||
- kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(vcpu->kvm) ||
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
if (!(exitintinfo & SVM_EXITINTINFO_VALID))
return;
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
+ vcpu->arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
/*
@@ -3725,21 +3726,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
*/
if (kvm_exception_is_soft(vector)) {
if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
+ kvm_is_linear_rip(vcpu, svm->int3_rip))
+ kvm_rip_write(vcpu,
+ kvm_rip_read(vcpu) - int3_injected);
break;
}
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
+ kvm_requeue_exception_e(vcpu, vector, err);
} else
- kvm_requeue_exception(&svm->vcpu, vector);
+ kvm_requeue_exception(vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
+ kvm_queue_interrupt(vcpu, vector, false);
break;
default:
break;
@@ -3754,7 +3754,12 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
control->exit_int_info = control->event_inj;
control->exit_int_info_err = control->event_inj_err;
control->event_inj = 0;
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
+}
+
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ return 1;
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,57 +3771,32 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long vmcb_pa = svm->current_vmcb->pa;
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ guest_state_enter_irqoff();
- if (sev_es_guest(svm->vcpu.kvm)) {
- __svm_sev_es_vcpu_run(svm->vmcb_pa);
+ if (sev_es_guest(vcpu->kvm)) {
+ __svm_sev_es_vcpu_run(vmcb_pa);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ vmload(svm->vmcb01.pa);
+ __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+ vmsave(svm->vmcb01.pa);
vmload(__sme_page_pa(sd->save_area));
}
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ guest_state_exit_irqoff();
}
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -3845,7 +3825,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
smp_send_reschedule(vcpu->cpu);
}
- pre_svm_run(svm);
+ pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -3855,11 +3835,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
}
svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
/*
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3857,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is no need to worry about the conditional branch over the wrmsr
* being speculatively taken.
*/
- x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu, svm);
+ svm_vcpu_enter_exit(vcpu);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,23 +3877,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+ unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
- if (!sev_es_guest(svm->vcpu.kvm))
+ if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
- x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ if (!sev_es_guest(vcpu->kvm)) {
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
}
+ vcpu->arch.regs_dirty = 0;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_interrupt(&svm->vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -3918,13 +3904,19 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_interrupt(&svm->vcpu);
+ kvm_after_interrupt(vcpu);
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
- if (is_guest_mode(&svm->vcpu)) {
- sync_nested_vmcb_control(svm);
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
+
+ /* Track VMRUNs that have made past consistency checking */
+ if (svm->nested.nested_run_pending &&
+ svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+ ++vcpu->stat.nested_run;
+
svm->nested.nested_run_pending = 0;
}
@@ -3933,13 +3925,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
- svm->vcpu.arch.apf.host_apf_flags =
+ vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
- if (npt_enabled) {
- vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
- vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
- }
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
/*
* We need to handle MC intercepts here before the vcpu has a chance to
@@ -3947,9 +3936,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
*/
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
+ svm_handle_mce(vcpu);
- svm_complete_interrupts(svm);
+ svm_complete_interrupts(vcpu);
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
@@ -3957,21 +3946,25 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
return svm_exit_handlers_fastpath(vcpu);
}
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long cr3;
- cr3 = __sme_set(root);
if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = cr3;
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
- /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- return;
+ hv_track_root_tdp(vcpu, root_hpa);
+
cr3 = vcpu->arch.cr3;
+ } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
}
svm->vmcb->save.cr3 = cr3;
@@ -4005,11 +3998,6 @@ static int __init svm_check_processor_compat(void)
return 0;
}
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4041,6 +4029,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_cpuid_entry2 *best;
+ struct kvm *kvm = vcpu->kvm;
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4048,10 +4037,22 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* Update nrips enabled cache */
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+ guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
+
+ svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+ svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+
+ svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
- /* Check again if INVPCID interception if required */
- svm_check_invpcid(svm);
+ svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+
+ svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+
+ svm_recalc_instruction_intercepts(vcpu, svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
@@ -4060,24 +4061,15 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- /*
- * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
- * is exposed to the guest, disable AVIC.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_X2APIC);
-
- /*
- * Currently, AVIC does not work with nested virtualization.
- * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
- */
- if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_request_apicv_update(vcpu->kvm, false,
- APICV_INHIBIT_REASON_NESTED);
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /*
+ * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+ * is exposed to the guest, disable AVIC.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
+ }
+ init_vmcb_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -4182,7 +4174,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
info->intercept == x86_intercept_clts)
break;
- if (!(vmcb_is_intercept(&svm->nested.ctl,
+ if (!(vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_SELECTIVE_CR0)))
break;
@@ -4271,6 +4263,8 @@ out:
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ vcpu->arch.at_instruction_boundary = true;
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -4302,65 +4296,123 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
return -EBUSY;
- return !svm_smi_blocked(vcpu);
+ return 1;
}
-static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map_save;
int ret;
- if (is_guest_mode(vcpu)) {
- /* FED8h - SVM Guest */
- put_smstate(u64, smstate, 0x7ed8, 1);
- /* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+ if (!is_guest_mode(vcpu))
+ return 0;
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
- ret = nested_svm_vmexit(svm);
- if (ret)
- return ret;
- }
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
+ if (ret)
+ return ret;
+
+ /*
+ * KVM uses VMCB01 to store L1 host state while L2 runs but
+ * VMCB01 is going to be used during SMM and thus the state will
+ * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
+ * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+ * format of the area is identical to guest save area offsetted
+ * by 0x400 (matches the offset of 'struct vmcb_save_area'
+ * within 'struct vmcb'). Note: HSAVE area may also be used by
+ * L1 hypervisor to save additional host context (e.g. KVM does
+ * that, see svm_prepare_switch_to_guest()) which must be
+ * preserved.
+ */
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+ &map_save) == -EINVAL)
+ return 1;
+
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
+
+ kvm_vcpu_unmap(vcpu, &map_save, true);
return 0;
}
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_host_map map;
- int ret = 0;
+ struct kvm_host_map map, map_save;
+ u64 saved_efer, vmcb12_gpa;
+ struct vmcb *vmcb12;
+ int ret;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 0;
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
- u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
- u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
- u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+ /* Non-zero if SMI arrived while vCPU was in guest mode. */
+ if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ return 0;
- if (guest) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
- if (!(saved_efer & EFER_SVME))
- return 1;
+ saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ if (!(saved_efer & EFER_SVME))
+ return 1;
- if (kvm_vcpu_map(&svm->vcpu,
- gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
- return 1;
+ vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ return 1;
- if (svm_allocate_nested(svm))
- return 1;
+ ret = 1;
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ goto unmap_map;
- ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
- }
- }
+ if (svm_allocate_nested(svm))
+ goto unmap_save;
+
+ /*
+ * Restore L1 host state from L1 HSAVE area as VMCB01 was
+ * used during SMM (see svm_enter_smm())
+ */
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
+ /*
+ * Enter the nested guest now
+ */
+
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
+ vmcb12 = map.hva;
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+ ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
+unmap_save:
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+unmap_map:
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
@@ -4369,7 +4421,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */
} else {
@@ -4377,79 +4429,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
}
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
unsigned long cr4;
+ u64 error_code;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
/*
- * When the guest is an SEV-ES guest, emulation is not possible.
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
return false;
/*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return true;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * decode garbage.
+ *
+ * Inject #UD if KVM reached this point without an instruction buffer.
+ * In practice, this path should never be hit by a well-behaved guest,
+ * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
+ * is still theoretically reachable, e.g. via unaccelerated fault-like
+ * AVIC access, and needs to be handled by KVM to avoid putting the
+ * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
+ * but its the least awful option given lack of insight into the guest.
+ */
+ if (unlikely(!insn)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return true;
+
+ /*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
- * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
- * possible that CPU microcode implementing DecodeAssist will fail
- * to read bytes of instruction which caused #NPF. In this case,
- * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
- * return 0 instead of the correct guest instruction bytes.
- *
- * This happens because CPU microcode reading instruction bytes
- * uses a special opcode which attempts to read data using CPL=0
- * priviledges. The microcode reads CS:RIP and if it hits a SMAP
- * fault, it gives up and returns no instruction bytes.
+ * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail to
+ * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+ * be '0'. This happens because microcode reads CS:RIP using a _data_
+ * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
+ * gives up and does not fill the instruction bytes buffer.
*
- * Detection:
- * We reach here in case CPU supports DecodeAssist, raised #NPF and
- * returned 0 in GuestIntrBytes field of the VMCB.
- * First, errata can only be triggered in case vCPU CR4.SMAP=1.
- * Second, if vCPU CR4.SMEP=1, errata could only be triggered
- * in case vCPU CPL==3 (Because otherwise guest would have triggered
- * a SMEP fault instead of #NPF).
- * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
- * As most guests enable SMAP if they have also enabled SMEP, use above
- * logic in order to attempt minimize false-positive of detecting errata
- * while still preserving all cases semantic correctness.
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
*
- * Workaround:
- * To determine what instruction the guest was executing, the hypervisor
- * will have to decode the instruction at the instruction pointer.
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempt to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
*
- * In non SEV guest, hypervisor will be able to read the guest
- * memory to decode the instruction pointer when insn_len is zero
- * so we return true to indicate that decoding is possible.
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered a SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see abvoe).
*
- * But in the SEV guest, the guest memory is encrypted with the
- * guest specific key and hypervisor will not be able to decode the
- * instruction pointer so we will not able to workaround it. Lets
- * print the error and request to kill the guest.
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
*/
- if (likely(!insn || insn_len))
- return true;
-
- /*
- * If RIP is invalid, go ahead with emulation which will cause an
- * internal error exit.
- */
- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
- return true;
+ error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
cr4 = kvm_read_cr4(vcpu);
smep = cr4 & X86_CR4_SMEP;
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- if (!sev_guest(vcpu->kvm))
- return true;
-
pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guest are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
return false;
}
@@ -4487,36 +4600,36 @@ static int svm_vm_init(struct kvm *kvm)
if (!pause_filter_count || !pause_filter_thresh)
kvm->arch.pause_in_guest = true;
- if (avic) {
+ if (enable_apicv) {
int ret = avic_vm_init(kvm);
if (ret)
return ret;
}
- kvm_apicv_init(kvm, avic);
return 0;
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .hardware_unsetup = svm_hardware_teardown,
+ .name = "kvm_amd",
+
+ .hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.has_emulated_msr = svm_has_emulated_msr,
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
- .prepare_guest_switch = svm_prepare_guest_switch,
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .vcpu_blocking = svm_vcpu_blocking,
- .vcpu_unblocking = svm_vcpu_unblocking,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
@@ -4526,8 +4639,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4540,21 +4654,23 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
- .tlb_flush_all = svm_flush_tlb,
- .tlb_flush_current = svm_flush_tlb,
- .tlb_flush_gva = svm_flush_tlb_gva,
- .tlb_flush_guest = svm_flush_tlb,
+ .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_current,
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
+ .vcpu_pre_run = svm_vcpu_pre_run,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
.update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4564,27 +4680,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
- .load_eoi_exitmap = svm_load_eoi_exitmap,
- .hwapic_irr_update = svm_hwapic_irr_update,
- .hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
- .apicv_post_state_restore = avic_post_state_restore,
-
- .set_tss_addr = svm_set_tss_addr,
- .set_identity_map_addr = svm_set_identity_map_addr,
- .get_mt_mask = svm_get_mt_mask,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+ .get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+ .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+ .write_tsc_offset = svm_write_tsc_offset,
+ .write_tsc_multiplier = svm_write_tsc_multiplier,
.load_mmu_pgd = svm_load_mmu_pgd,
@@ -4595,22 +4705,24 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.sched_in = svm_sched_in,
- .pmu_ops = &amd_pmu_ops,
.nested_ops = &svm_nested_ops,
- .deliver_posted_interrupt = svm_deliver_avic_intr,
- .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
- .update_pi_irte = svm_update_pi_irte,
+ .deliver_interrupt = svm_deliver_interrupt,
+ .pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
.smi_allowed = svm_smi_allowed,
- .pre_enter_smm = svm_pre_enter_smm,
- .pre_leave_smm = svm_pre_leave_smm,
+ .enter_smm = svm_enter_smm,
+ .leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
- .mem_enc_op = svm_mem_enc_op,
- .mem_enc_reg_region = svm_register_enc_region,
- .mem_enc_unreg_region = svm_unregister_enc_region,
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
+
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
.can_emulate_instruction = svm_can_emulate_instruction,
@@ -4620,8 +4732,269 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
};
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ supported_xss = 0;
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* AMD PMU PERFCTR_CORE CPUID */
+ if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+ unsigned int order = get_order(IOPM_SIZE);
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_has_tsc_control = true;
+ }
+ }
+ kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+ /* Setup shadow_me_value and shadow_me_mask */
+ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
+
+ /* Note, SEV setup consumes npt_enabled. */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (nrips) {
+ if (!boot_cpu_has(X86_FEATURE_NRIPS))
+ nrips = false;
+ }
+
+ enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
+
+ if (enable_apicv) {
+ if (!boot_cpu_has(X86_FEATURE_AVIC)) {
+ pr_warn("AVIC is not supported in CPUID but force enabled");
+ pr_warn("Your system might crash and burn");
+ } else
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ } else {
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ /*
+ * It seems that on AMD processors PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ return 0;
+
+err:
+ svm_hardware_unsetup();
+ return r;
+}
+
+
static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -4629,6 +5002,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 39e071fdab0c..1bddd336a27e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -20,19 +20,27 @@
#include <linux/bits.h>
#include <asm/svm.h>
+#include <asm/sev-common.h>
+
+#include "kvm_cache_regs.h"
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-static const u32 host_save_user_msrs[] = {
- MSR_TSC_AUX,
-};
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define IOPM_SIZE PAGE_SIZE * 3
+#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define MAX_DIRECT_ACCESS_MSRS 21
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+extern int vgif;
+extern bool intercept_smi;
+/*
+ * Clean bits in VMCB.
+ * VMCB_ALL_CLEAN_MASK might also need to
+ * be updated if this enum is modified.
+ */
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
pause filter count */
@@ -50,9 +58,17 @@ enum {
* AVIC PHYSICAL_TABLE pointer,
* AVIC LOGICAL_TABLE pointer
*/
- VMCB_DIRTY_MAX,
+ VMCB_SW = 31, /* Reserved for hypervisor/software use */
};
+#define VMCB_ALL_CLEAN_MASK ( \
+ (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
+ (1U << VMCB_ASID) | (1U << VMCB_INTR) | \
+ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
+ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
+ (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \
+ (1U << VMCB_SW))
+
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
@@ -65,6 +81,11 @@ struct kvm_sev_info {
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
+ struct misc_cg *misc_cg; /* For misc cgroup accounting */
+ atomic_t migration_in_progress;
};
struct kvm_svm {
@@ -81,11 +102,55 @@ struct kvm_svm {
struct kvm_vcpu;
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ u8 reserved_sw[32];
+};
+
struct svm_nested_state {
- struct vmcb *hsave;
+ struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
/* These are the merged vectors */
u32 *msrpm;
@@ -95,29 +160,59 @@ struct svm_nested_state {
bool nested_run_pending;
/* cache for control fields of the guest */
- struct vmcb_control_area ctl;
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+};
+
+struct vcpu_sev_es_state {
+ /* SEV-ES support */
+ struct sev_es_save_area *vmsa;
+ struct ghcb *ghcb;
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+
+ /* SEV-ES scratch area support */
+ void *ghcb_sa;
+ u32 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
};
struct vcpu_svm {
struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
struct vmcb *vmcb;
- unsigned long vmcb_pa;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
struct svm_cpu_data *svm_data;
u32 asid;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
uint64_t tsc_aux;
u64 msr_decfg;
u64 next_rip;
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-
u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
/*
* Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
* translated into the appropriate L2_CFG bits on the host to
@@ -137,14 +232,19 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
- /* cached guest cpuid flags for faster access */
- bool nrips_enabled : 1;
+ /* optional nested SVM features that are enabled for this guest */
+ bool nrips_enabled : 1;
+ bool tsc_scaling_enabled : 1;
+ bool v_vmload_vmsave_enabled : 1;
+ bool lbrv_enabled : 1;
+ bool pause_filter_enabled : 1;
+ bool pause_threshold_enabled : 1;
+ bool vgif_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
- bool avic_is_running;
/*
* Per-vcpu list of struct amd_svm_iommu_ir:
@@ -161,17 +261,7 @@ struct vcpu_svm {
DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
} shadow_msr_intercept;
- /* SEV-ES support */
- struct vmcb_save_area *vmsa;
- struct ghcb *ghcb;
- struct kvm_host_map ghcb_map;
- bool received_first_sipi;
-
- /* SEV-ES scratch area support */
- void *ghcb_sa;
- u64 ghcb_sa_len;
- bool ghcb_sa_sync;
- bool ghcb_sa_free;
+ struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
};
@@ -196,12 +286,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
void recalc_intercepts(struct vcpu_svm *svm);
-static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
return container_of(kvm, struct kvm_svm, kvm);
}
-static inline bool sev_guest(struct kvm *kvm)
+static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -212,12 +302,12 @@ static inline bool sev_guest(struct kvm *kvm)
#endif
}
-static inline bool sev_es_guest(struct kvm *kvm)
+static __always_inline bool sev_es_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- return sev_guest(kvm) && sev->es_active;
+ return sev->es_active && !WARN_ON_ONCE(!sev->active);
#else
return false;
#endif
@@ -230,7 +320,7 @@ static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
{
- vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ vmcb->control.clean = VMCB_ALL_CLEAN_MASK
& ~VMCB_ALWAYS_DIRTY_MASK;
}
@@ -239,19 +329,26 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
vmcb->control.clean &= ~(1 << bit);
}
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
{
- return container_of(vcpu, struct vcpu_svm, vcpu);
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(&svm->vcpu))
- return svm->nested.hsave;
- else
- return svm->vmcb;
+ return container_of(vcpu, struct vcpu_svm, vcpu);
}
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
@@ -270,9 +367,15 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
return test_bit(bit, (unsigned long *)&control->intercepts);
}
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
if (!sev_es_guest(svm->vcpu.kvm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -299,7 +402,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb->control.intercepts[INTERCEPT_DR] = 0;
@@ -314,7 +417,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -324,7 +427,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -334,7 +437,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_set_intercept(&vmcb->control, bit);
@@ -343,7 +446,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_clr_intercept(&vmcb->control, bit);
@@ -355,59 +458,84 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
return vmcb_is_intercept(&svm->vmcb->control, bit);
}
-static inline bool vgif_enabled(struct vcpu_svm *svm)
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
+{
+ return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
{
- return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
}
static inline void enable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
else
svm->vcpu.arch.hflags |= HF_GIF_MASK;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
-extern int sev;
-extern int sev_es;
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
extern bool dump_invalid_vmcb;
u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
/* nested.c */
@@ -424,153 +552,111 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb);
-void svm_leave_nested(struct vcpu_svm *svm);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
+ u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
-void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save);
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
+
int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void __svm_write_tsc_multiplier(u64 multiplier);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
-
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
-extern int avic;
-
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
- svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
- vmcb_mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 *entry = svm->avic_physical_id_cache;
-
- if (!entry)
- return false;
-
- return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
-void avic_init_vmcb(struct vcpu_svm *svm);
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
-void avic_post_state_restore(struct kvm_vcpu *vcpu);
-void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
-void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
-void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
-bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
+void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-#define GHCB_MSR_INFO_POS 0
-#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
-
-#define GHCB_MSR_SEV_INFO_RESP 0x001
-#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
- GHCB_MSR_SEV_INFO_RESP)
-
-#define GHCB_MSR_CPUID_REQ 0x004
-#define GHCB_MSR_CPUID_RESP 0x005
-#define GHCB_MSR_CPUID_FUNC_POS 32
-#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
-#define GHCB_MSR_CPUID_VALUE_POS 32
-#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
-#define GHCB_MSR_CPUID_REG_POS 30
-#define GHCB_MSR_CPUID_REG_MASK 0x3
-
-#define GHCB_MSR_TERM_REQ 0x100
-#define GHCB_MSR_TERM_REASON_SET_POS 12
-#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
-#define GHCB_MSR_TERM_REASON_POS 16
-#define GHCB_MSR_TERM_REASON_MASK 0xff
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
-extern unsigned int max_sev_asid;
-static inline bool svm_sev_enabled(void)
-{
- return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
-}
+extern unsigned int max_sev_asid;
void sev_vm_destroy(struct kvm *kvm);
-int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
-int svm_register_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
-int svm_unregister_enc_region(struct kvm *kvm,
- struct kvm_enc_region *range);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
-void sev_hardware_teardown(void);
+void sev_hardware_unsetup(void);
+int sev_cpu_init(struct svm_cpu_data *sd);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
-void sev_es_create_vcpu(struct vcpu_svm *svm);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
new file mode 100644
index 000000000000..8cdc62c74a96
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/mshyperv.h>
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightenments *hve;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw;
+
+ hve->partition_assist_page = __pa(*p_hv_pa_pg);
+ hve->hv_vm_id = (unsigned long)vcpu->kvm;
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
new file mode 100644
index 000000000000..e2fc59380465
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
+
+static struct kvm_x86_ops svm_x86_ops;
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
+ hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+}
+
+static inline void svm_hv_hardware_setup(void)
+{
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
+ pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
+ svm_x86_ops.tlb_remote_flush_with_range =
+ hv_remote_flush_tlb_with_range;
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
+ int cpu;
+
+ pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ for_each_online_cpu(cpu) {
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 1;
+ }
+ svm_x86_ops.enable_direct_tlbflush =
+ svm_hv_enable_direct_tlbflush;
+ }
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+
+ if (hve->hv_enlightenments_control.msr_bitmap)
+ vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ u32 vp_index = kvm_hv_get_vpindex(vcpu);
+
+ if (hve->hv_vp_id != vp_index) {
+ hve->hv_vp_id = vp_index;
+ vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ }
+}
+#else
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+}
+
+static inline void svm_hv_hardware_setup(void)
+{
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+}
+#endif /* CONFIG_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 8170f2a5a16f..9430d6437c9f 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -4,7 +4,7 @@
#include <linux/compiler_types.h>
-#include <asm/kvm_host.h>
+#include "x86.h"
#define svm_asm(insn, clobber...) \
do { \
@@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid)
* VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
* hence 'unsigned long' instead of 'hpa_t'.
*/
-static inline void vmsave(unsigned long pa)
+static __always_inline void vmsave(unsigned long pa)
{
svm_asm1(vmsave, "a" (pa), "memory");
}
-static inline void vmload(unsigned long pa)
+static __always_inline void vmload(unsigned long pa)
{
svm_asm1(vmload, "a" (pa), "memory");
}
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 6feb8c08f45a..dfaeb47fcf2a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Enter guest mode */
sti
-1: vmload %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: vmrun %_ASM_AX
- jmp 5f
-4: cmpb $0, kvm_rebooting
- jne 5f
- ud2
- _ASM_EXTABLE(3b, 4b)
+1: vmrun %_ASM_AX
-5: vmsave %_ASM_AX
- jmp 7f
-6: cmpb $0, kvm_rebooting
- jne 7f
- ud2
- _ASM_EXTABLE(5b, 6b)
-7:
- cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -166,7 +148,14 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_vcpu_run)
/**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
- /* Enter guest mode */
+ /* Move @vmcb to RAX. */
mov %_ASM_ARG1, %_ASM_AX
+
+ /* Enter guest mode */
sti
1: vmrun %_ASM_AX
- jmp 3f
-2: cmpb $0, kvm_rebooting
- jne 3f
- ud2
- _ASM_EXTABLE(1b, 2b)
-3: cli
+2: cli
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -216,5 +202,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
SYM_FUNC_END(__svm_sev_es_vcpu_run)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a61c015870e3..de4762517569 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -64,9 +64,9 @@ TRACE_EVENT(kvm_hypercall,
* Tracepoint for hypercall.
*/
TRACE_EVENT(kvm_hv_hypercall,
- TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
- __u64 ingpa, __u64 outgpa),
- TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
TP_STRUCT__entry(
__field( __u16, rep_cnt )
@@ -74,6 +74,7 @@ TRACE_EVENT(kvm_hv_hypercall,
__field( __u64, ingpa )
__field( __u64, outgpa )
__field( __u16, code )
+ __field( __u16, var_cnt )
__field( bool, fast )
),
@@ -83,13 +84,29 @@ TRACE_EVENT(kvm_hv_hypercall,
__entry->ingpa = ingpa;
__entry->outgpa = outgpa;
__entry->code = code;
+ __entry->var_cnt = var_cnt;
__entry->fast = fast;
),
- TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
__entry->code, __entry->fast ? "fast" : "slow",
- __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
- __entry->outgpa)
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
+);
+
+TRACE_EVENT(kvm_hv_hypercall_done,
+ TP_PROTO(u64 result),
+ TP_ARGS(result),
+
+ TP_STRUCT__entry(
+ __field(__u64, result)
+ ),
+
+ TP_fast_assign(
+ __entry->result = result;
+ ),
+
+ TP_printk("result 0x%llx", __entry->result)
);
/*
@@ -236,13 +253,13 @@ TRACE_EVENT(kvm_cpuid,
* Tracepoint for apic access.
*/
TRACE_EVENT(kvm_apic,
- TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_PROTO(unsigned int rw, unsigned int reg, u64 val),
TP_ARGS(rw, reg, val),
TP_STRUCT__entry(
__field( unsigned int, rw )
__field( unsigned int, reg )
- __field( unsigned int, val )
+ __field( u64, val )
),
TP_fast_assign(
@@ -251,7 +268,7 @@ TRACE_EVENT(kvm_apic,
__entry->val = val;
),
- TP_printk("apic_%s %s = 0x%x",
+ TP_printk("apic_%s %s = 0x%llx",
__entry->rw ? "write" : "read",
__print_symbolic(__entry->reg, kvm_trace_symbol_apic),
__entry->val)
@@ -273,8 +290,8 @@ TRACE_EVENT(kvm_apic,
#define TRACE_EVENT_KVM_EXIT(name) \
TRACE_EVENT(name, \
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
- TP_ARGS(exit_reason, vcpu, isa), \
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(vcpu, isa), \
\
TP_STRUCT__entry( \
__field( unsigned int, exit_reason ) \
@@ -288,11 +305,12 @@ TRACE_EVENT(name, \
), \
\
TP_fast_assign( \
- __entry->exit_reason = exit_reason; \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
@@ -997,7 +1015,7 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
-TRACE_EVENT(kvm_enter_smm,
+TRACE_EVENT(kvm_smm_transition,
TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
TP_ARGS(vcpu_id, smbase, entering),
@@ -1321,23 +1339,49 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
-TRACE_EVENT(kvm_apicv_update_request,
- TP_PROTO(bool activate, unsigned long bit),
- TP_ARGS(activate, bit),
+TRACE_EVENT(kvm_apicv_inhibit_changed,
+ TP_PROTO(int reason, bool set, unsigned long inhibits),
+ TP_ARGS(reason, set, inhibits),
+
+ TP_STRUCT__entry(
+ __field(int, reason)
+ __field(bool, set)
+ __field(unsigned long, inhibits)
+ ),
+
+ TP_fast_assign(
+ __entry->reason = reason;
+ __entry->set = set;
+ __entry->inhibits = inhibits;
+ ),
+
+ TP_printk("%s reason=%u, inhibits=0x%lx",
+ __entry->set ? "set" : "cleared",
+ __entry->reason, __entry->inhibits)
+);
+
+TRACE_EVENT(kvm_apicv_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
TP_STRUCT__entry(
- __field(bool, activate)
- __field(unsigned long, bit)
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
),
TP_fast_assign(
- __entry->activate = activate;
- __entry->bit = bit;
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
),
- TP_printk("%s bit=%lu",
- __entry->activate ? "activate" : "deactivate",
- __entry->bit)
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
);
/*
@@ -1415,6 +1459,26 @@ TRACE_EVENT(kvm_avic_ga_log,
__entry->vmid, __entry->vcpuid)
);
+TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
+ TP_PROTO(u32 icrh, u32 icrl, u32 index),
+ TP_ARGS(icrh, icrl, index),
+
+ TP_STRUCT__entry(
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->index = index;
+ ),
+
+ TP_printk("icrh:icrl=%#08x:%08x, index=%u",
+ __entry->icrh, __entry->icrl, __entry->index)
+);
+
TRACE_EVENT(kvm_hv_timer_state,
TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
TP_ARGS(vcpu_id, hv_timer_in_use),
@@ -1550,16 +1614,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
TP_ARGS(msg, err),
TP_STRUCT__entry(
- __field(const char *, msg)
+ __string(msg, msg)
__field(u32, err)
),
TP_fast_assign(
- __entry->msg = msg;
+ __assign_str(msg, msg);
__entry->err = err;
),
- TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
+ TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d1d77985e889..3f430e218375 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include "lapic.h"
+#include "x86.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -12,7 +13,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_apicv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -54,7 +54,6 @@ struct nested_vmx_msrs {
struct vmcs_config {
int size;
- int order;
u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
@@ -90,8 +89,7 @@ static inline bool cpu_has_vmx_preemption_timer(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
- vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_load_ia32_efer(void)
@@ -314,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -382,6 +389,9 @@ static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = 0;
+ if (!enable_pmu)
+ return perf_cap;
+
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
@@ -398,6 +408,9 @@ static inline u64 vmx_supported_debugctl(void)
{
u64 debugctl = 0;
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
debugctl |= DEBUGCTLMSR_LBR_MASK;
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 41f24661af04..87e3dc10edf4 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -12,9 +12,6 @@
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
-#if IS_ENABLED(CONFIG_HYPERV)
-
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
@@ -297,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
+#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -319,6 +317,9 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
if (unlikely(!assist_page.enlighten_vmentry))
return false;
+ if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
+ return false;
+
*evmcs_gpa = assist_page.current_nested_vmcs;
return true;
@@ -351,14 +352,21 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC;
break;
}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index bd41d9462355..8d70f9aea94b 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
-#if IS_ENABLED(CONFIG_HYPERV)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -73,31 +73,57 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
-static __always_inline int get_evmcs_offset(unsigned long field,
- u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
{
unsigned int index = ROL16(field, 6);
const struct evmcs_field *evmcs_field;
- if (unlikely(index >= nr_evmcs_1_fields)) {
- WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
+ if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;
- }
evmcs_field = &vmcs_field_to_evmcs_1[index];
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
}
-#undef ROL16
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field, use it for convenience
+ * here.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
-static inline void evmcs_write64(unsigned long field, u64 value)
+ return offset;
+}
+
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -187,7 +213,7 @@ static inline void evmcs_load(u64 phys_addr)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
-static inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
static inline void evmcs_write16(unsigned long field, u16 value) {}
static inline u64 evmcs_read64(unsigned long field) { return 0; }
@@ -197,6 +223,14 @@ static inline void evmcs_load(u64 phys_addr) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+#define EVMPTR_INVALID (-1ULL)
+#define EVMPTR_MAP_PENDING (-2ULL)
+
+static inline bool evmptr_is_valid(u64 evmptr)
+{
+ return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
+}
+
enum nested_evmptrld_status {
EVMPTRLD_DISABLED,
EVMPTRLD_SUCCEEDED,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcca0b80e0d0..f5cb18e00e78 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,10 +7,12 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
+#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
@@ -21,13 +23,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
-#define CC(consistency_check) \
-({ \
- bool failed = (consistency_check); \
- if (failed) \
- trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
- failed; \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
/*
* Hyper-V requires all of these, so mark them as supported even though
@@ -178,9 +174,13 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
| X86_EFLAGS_ZF);
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
/*
- * We don't need to force a shadow sync because
- * VM_INSTRUCTION_ERROR is not shadowed
+ * We don't need to force sync to shadow VMCS because
+ * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
+ * fields and thus must be synced.
*/
+ if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
+
return kvm_skip_emulated_instruction(vcpu);
}
@@ -192,7 +192,8 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
* failValid writes the error number to the current VMCS, which
* can't be done if there isn't a current VMCS.
*/
- if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ if (vmx->nested.current_vmptr == INVALID_GPA &&
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return nested_vmx_failInvalid(vcpu);
return nested_vmx_failValid(vcpu, vm_instruction_error);
@@ -218,7 +219,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
vmx->nested.need_vmcs12_to_shadow_sync = false;
}
@@ -226,12 +227,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->nested.hv_evmcs)
- return;
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
+ vmx->nested.hv_evmcs = NULL;
+ }
- kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs_vmptr = 0;
- vmx->nested.hv_evmcs = NULL;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -269,7 +270,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
}
/*
@@ -290,9 +297,10 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -312,7 +320,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
nested_release_evmcs(vcpu);
@@ -330,6 +338,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
+static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
+ gpa_t addr)
+{
+ uint i;
+ struct kvm_mmu_root_info *cached_root;
+
+ WARN_ON_ONCE(!mmu_is_nested(vcpu));
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ cached_root = &vcpu->arch.mmu->prev_roots[i];
+
+ if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
+ eptp))
+ vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+ }
+}
+
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -342,25 +375,44 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
exit_qualification &= INTR_INFO_UNBLOCK_NMI;
- } else if (fault->error_code & PFERR_RSVD_MASK)
- vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
- else
- vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ } else {
+ if (fault->error_code & PFERR_RSVD_MASK)
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ /*
+ * Although the caller (kvm_inject_emulated_page_fault) would
+ * have already synced the faulting address in the shadow EPT
+ * tables for the current EPTP12, we also need to sync it for
+ * any other cached EPTP02s based on the same EP4TA, since the
+ * TLB associates mappings to the EP4TA rather than the full EPTP.
+ */
+ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
+ fault->address);
+ }
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
}
+static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_eptp(vcpu));
+}
+
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.msrs.ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT,
- nested_ept_ad_enabled(vcpu),
- nested_ept_get_eptp(vcpu));
+ nested_ept_new_eptp(vcpu);
vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
@@ -424,24 +476,23 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
return 0;
}
-
-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
WARN_ON(!is_guest_mode(vcpu));
if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
- !to_vmx(vcpu)->nested.nested_run_pending) {
+ !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
vmcs12->vm_exit_intr_error_code = fault->error_code;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
fault->address);
- } else {
- kvm_inject_page_fault(vcpu, fault);
+ return true;
}
+ return false;
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -482,67 +533,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
}
/*
- * Check if MSR is intercepted for L01 MSR bitmap.
- */
-static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
-{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return true;
-
- msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
-}
-
-/*
- * If a msr is allowed by L0, we should check whether it is allowed by L1.
- * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
+ * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
+ * only the "disable intercept" case needs to be handled.
*/
-static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
- unsigned long *msr_bitmap_nested,
- u32 msr, int type)
+static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int type)
{
- int f = sizeof(unsigned long);
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
- /* read-low */
- __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
-
- if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
- /* write-low */
- __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
- /* read-high */
- __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
-
- if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
- /* write-high */
- __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+ if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
- }
+ if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
@@ -557,6 +560,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
}
}
+#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
+static inline \
+void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
+ unsigned long *msr_bitmap_l1, \
+ unsigned long *msr_bitmap_l0, u32 msr) \
+{ \
+ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
+ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
+ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+ else \
+ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+}
+BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
+BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
+
+static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
+ unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int types)
+{
+ if (types & MSR_TYPE_R)
+ nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+ if (types & MSR_TYPE_W)
+ nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -564,16 +595,31 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
int msr;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
- struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+ unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
+ evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+
if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
@@ -582,7 +628,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
* 4-byte writes on 32-bit systems) up front to enable intercepts for
- * the x2APIC MSR range and selectively disable them below.
+ * the x2APIC MSR range and selectively toggle those relevant to L2.
*/
enable_x2apic_msr_intercepts(msr_bitmap_l0);
@@ -601,59 +647,46 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
}
}
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_TASKPRI),
MSR_TYPE_R | MSR_TYPE_W);
if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_EOI),
MSR_TYPE_W);
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_SELF_IPI),
MSR_TYPE_W);
}
}
- /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_FS_BASE, MSR_TYPE_RW);
+ /*
+ * Always check vmcs01's bitmap to honor userspace MSR filters and any
+ * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
+ */
+#ifdef CONFIG_X86_64
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
- /*
- * Checking the L0->L1 bitmap is trying to verify two things:
- *
- * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
- * ensures that we do not accidentally generate an L02 MSR bitmap
- * from the L12 MSR bitmap that is too permissive.
- * 2. That L1 or L2s have actually used the MSR. This avoids
- * unnecessarily merging of the bitmap if the MSR is unused. This
- * works properly because we only update the L01 MSR bitmap lazily.
- * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
- * updated to reflect this when L1 (or its L2s) actually write to
- * the MSR.
- */
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_SPEC_CTRL,
- MSR_TYPE_R | MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD, MSR_TYPE_W);
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
- kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
+ vmx->nested.force_msr_bitmap_recalc = false;
return true;
}
@@ -661,33 +694,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- struct kvm_host_map map;
- struct vmcs12 *shadow;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
return;
- shadow = get_shadow_vmcs12(vcpu);
-
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- memcpy(shadow, map.hva, VMCS12_SIZE);
- kvm_vcpu_unmap(vcpu, &map, false);
+ kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
}
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
- get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+ kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
}
/*
@@ -1061,54 +1100,13 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
}
/*
- * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
- * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
- * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
- * Here's why.
- *
- * If EPT is enabled by L0 a sync is never needed:
- * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
- * cannot be unsync'd SPTEs for either L1 or L2.
- *
- * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
- * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
- * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
- * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
- * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
- *
- * If EPT is disabled by L0:
- * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
- * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
- * required to invalidate linear mappings (EPT is disabled so there are
- * no combined or guest-physical mappings), i.e. L1 can't rely on the
- * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
- *
- * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
- * linear mappings (EPT is disabled so there are no combined or guest-physical
- * mappings) to be invalidated on both VM-Enter and VM-Exit.
- *
- * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
- * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
- * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
- * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
- * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
- * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
- * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
- * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
- * stale TLB entries, at which point L0 will sync L2's MMU.
- */
-static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
-{
- return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
-}
-
-/*
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
* Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
* @entry_failure_code.
*/
-static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_ept, bool reload_pdptrs,
enum vm_entry_failure_code *entry_failure_code)
{
if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
@@ -1120,27 +1118,20 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
* must not be dereferenced.
*/
- if (!nested_ept && is_pae_paging(vcpu) &&
- (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
- *entry_failure_code = ENTRY_FAIL_PDPTE;
- return -EINVAL;
- }
+ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return -EINVAL;
}
- /*
- * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
- * flushes are handled by nested_vmx_transition_tlb_flush(). See
- * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
- */
- if (!nested_ept)
- kvm_mmu_new_pgd(vcpu, cr3, true,
- !nested_vmx_transition_mmu_sync(vcpu));
-
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
- kvm_init_mmu(vcpu, false);
+ if (!nested_ept)
+ kvm_mmu_new_pgd(vcpu, cr3);
return 0;
}
@@ -1173,41 +1164,48 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * If VPID is disabled, linear and combined mappings are flushed on
- * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
- * their associated EPTP.
+ * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+ * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
+ * full TLB flush from the guest's perspective. This is required even
+ * if VPID is disabled in the host as KVM may need to synchronize the
+ * MMU in response to the guest TLB flush.
+ *
+ * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
+ * EPT is a special snowflake, as guest-physical mappings aren't
+ * flushed on VPID invalidations, including VM-Enter or VM-Exit with
+ * VPID disabled. As a result, KVM _never_ needs to sync nEPT
+ * entries on VM-Enter because L1 can't rely on VM-Enter to flush
+ * those mappings.
*/
- if (!enable_vpid)
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return;
+ }
+
+ /* L2 should never have a VPID if VPID is disabled. */
+ WARN_ON(!enable_vpid);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit.
- *
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_cpu_has_vpid(vmcs12)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- } else if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -1589,7 +1587,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
vmcs_load(vmx->loaded_vmcs->vmcs);
}
-static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
@@ -1598,7 +1596,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1606,23 +1604,23 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->guest_interruptibility_info;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
vmcs12->cpu_based_vm_exec_control =
evmcs->cpu_based_vm_exec_control;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
vmcs12->exception_bitmap = evmcs->exception_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
vmcs12->vm_entry_intr_info_field =
evmcs->vm_entry_intr_info_field;
@@ -1632,7 +1630,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->vm_entry_instruction_len;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
@@ -1652,7 +1650,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->host_tr_selector = evmcs->host_tr_selector;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
vmcs12->pin_based_vm_exec_control =
evmcs->pin_based_vm_exec_control;
@@ -1661,18 +1659,18 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->secondary_vm_exec_control;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
vmcs12->msr_bitmap = evmcs->msr_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
vmcs12->guest_es_base = evmcs->guest_es_base;
vmcs12->guest_cs_base = evmcs->guest_cs_base;
@@ -1712,14 +1710,14 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
vmcs12->tsc_offset = evmcs->tsc_offset;
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
@@ -1731,7 +1729,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->guest_dr7 = evmcs->guest_dr7;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
vmcs12->host_fs_base = evmcs->host_fs_base;
vmcs12->host_gs_base = evmcs->host_gs_base;
@@ -1741,13 +1739,13 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->host_rsp = evmcs->host_rsp;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
vmcs12->ept_pointer = evmcs->ept_pointer;
vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
@@ -1802,10 +1800,10 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
* vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
*/
- return 0;
+ return;
}
-static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
@@ -1965,7 +1963,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
- return 0;
+ return;
}
/*
@@ -1982,13 +1980,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!vmx->nested.enlightened_vmcs_enabled))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
+ if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
+ }
- if (unlikely(!vmx->nested.hv_evmcs ||
- evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
- if (!vmx->nested.hv_evmcs)
- vmx->nested.current_vmptr = -1ull;
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ vmx->nested.current_vmptr = INVALID_GPA;
nested_release_evmcs(vcpu);
@@ -2026,7 +2024,6 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
return EVMPTRLD_VMFAIL;
}
- vmx->nested.dirty_vmcs12 = true;
vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
evmcs_gpa_changed = true;
@@ -2048,10 +2045,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
- if (from_launch || evmcs_gpa_changed)
+ if (from_launch || evmcs_gpa_changed) {
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
return EVMPTRLD_SUCCEEDED;
}
@@ -2059,14 +2059,10 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.hv_evmcs) {
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
copy_vmcs12_to_enlightened(vmx);
- /* All fields are clean */
- vmx->nested.hv_evmcs->hv_clean_fields |=
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- } else {
+ else
copy_vmcs12_to_shadow(vmx);
- }
vmx->nested.need_vmcs12_to_shadow_sync = false;
}
@@ -2177,7 +2173,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
}
if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
/*
* Set the MSR load/store lists to match L0's settings. Only the
@@ -2196,7 +2192,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
{
prepare_vmcs02_constant_state(vmx);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -2206,34 +2202,34 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
}
}
-static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
{
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
- if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
prepare_vmcs02_early_rare(vmx, vmcs12);
/*
* PIN CONTROLS
*/
- exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control = __pin_controls_get(vmcs01);
exec_control |= (vmcs12->pin_based_vm_exec_control &
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
* EXEC CONTROLS
*/
- exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
@@ -2270,17 +2266,21 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* SECONDARY EXEC CONTROLS
*/
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmx->secondary_exec_control;
+ exec_control = __secondary_exec_controls_get(vmcs01);
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_ENABLE_VMFUNC);
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_DESC);
+
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -2306,6 +2306,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
secondary_exec_controls_set(vmx, exec_control);
}
@@ -2317,8 +2320,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
*/
- exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
- ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= vmcs12->vm_entry_controls;
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
@@ -2334,9 +2338,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
- exec_control = vmx_vmexit_ctrl();
+ exec_control = __vm_exit_controls_get(vmcs01);
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
vm_exit_controls_set(vmx, exec_control);
/*
@@ -2488,18 +2494,18 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry,
enum vm_entry_failure_code *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
bool load_guest_pdptrs_vmcs12 = false;
- if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
- load_guest_pdptrs_vmcs12 = !hv_evmcs ||
- !(hv_evmcs->hv_clean_fields &
+ load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
+ !(vmx->nested.hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
@@ -2532,10 +2538,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
- vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ vmx_get_l2_tsc_offset(vcpu),
+ vmx_get_l2_tsc_multiplier(vcpu));
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ vcpu->arch.l1_tsc_scaling_ratio,
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (kvm_has_tsc_control)
- decache_tsc_multiplier(vmx);
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
@@ -2564,15 +2578,20 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Guest state is invalid and unrestricted guest is disabled,
* which means L1 attempted VMEntry to L2 with invalid state.
* Fail the VMEntry.
+ *
+ * However when force loading the guest state (SMM exit or
+ * loading nested state after migration, it is possible to
+ * have invalid guest state now, which will be later fixed by
+ * restoring L2 register state
*/
- if (CC(!vmx_guest_state_valid(vcpu))) {
+ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
- entry_failure_code))
+ from_vmentry, entry_failure_code))
return -EINVAL;
/*
@@ -2594,16 +2613,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
+
+ /*
+ * It was observed that genuine Hyper-V running in L1 doesn't reset
+ * 'hv_clean_fields' by itself, it only sets the corresponding dirty
+ * bits when it changes a field in eVMCS. Mark all fields as clean
+ * here.
+ */
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ vmx->nested.hv_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
return 0;
}
@@ -2830,6 +2859,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -2854,18 +2894,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_X86_64
- ia32e = !!(vcpu->arch.efer & EFER_LMA);
+ ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
#else
ia32e = false;
#endif
if (ia32e) {
- if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
- CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
} else {
- if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
- CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
CC((vmcs12->host_rip) >> 32))
return -EINVAL;
@@ -2910,27 +2948,31 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int r = 0;
- struct vmcs12 *shadow;
- struct kvm_host_map map;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+ struct vmcs_hdr hdr;
- if (vmcs12->vmcs_link_pointer == -1ull)
+ if (vmcs12->vmcs_link_pointer == INVALID_GPA)
return 0;
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
return -EINVAL;
- if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
- return -EINVAL;
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
+ return -EINVAL;
- shadow = map.hva;
+ if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))))
+ return -EINVAL;
- if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
- CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
- r = -EINVAL;
+ if (CC(hdr.revision_id != VMCS12_REVISION) ||
+ CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
+ return -EINVAL;
- kvm_vcpu_unmap(vcpu, &map, false);
- return r;
+ return 0;
}
/*
@@ -3093,20 +3135,20 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+ if (vmx->nested.enlightened_vmcs_enabled &&
+ vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL ||
- evmptrld_status == EVMPTRLD_ERROR) {
- pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
- __func__);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ evmptrld_status == EVMPTRLD_ERROR)
return false;
- }
+
+ /*
+ * Post migration VMCS12 always provides the most actual
+ * information, copy it to eVMCS upon entry.
+ */
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
}
return true;
@@ -3120,6 +3162,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
struct page *page;
u64 hpa;
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state which can lead to a load of wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+ }
+
+
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
/*
* Translate L1 physical address to host physical
@@ -3169,7 +3223,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
* force VM-Entry to fail.
*/
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
}
}
@@ -3182,6 +3236,15 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
offset_in_page(vmcs12->posted_intr_desc_addr));
vmcs_write64(POSTED_INTR_DESC_ADDR,
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ } else {
+ /*
+ * Defer the KVM_INTERNAL_EXIT until KVM tries to
+ * access the contents of the VMCS12 posted interrupt
+ * descriptor. (Note that KVM may do this when it
+ * should not, per the architectural specification.)
+ */
+ vmx->nested.pi_desc = NULL;
+ pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
}
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
@@ -3194,8 +3257,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- if (!nested_get_evmcs_page(vcpu))
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
return false;
+ }
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false;
@@ -3296,8 +3367,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -3331,7 +3401,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- prepare_vmcs02_early(vmx, vmcs12);
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) {
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@ -3353,10 +3423,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
}
enter_guest_mode(vcpu);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
- vcpu->arch.tsc_offset += vmcs12->tsc_offset;
- if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
@@ -3436,7 +3504,7 @@ vmentry_fail_vmexit:
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason.full;
- if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+ if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
}
@@ -3460,11 +3528,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
- return nested_vmx_failInvalid(vcpu);
}
- if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
+ vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -3478,8 +3550,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
- if (vmx->nested.hv_evmcs) {
- copy_enlightened_to_vmcs12(vmx);
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
/* Enlightened VMCS doesn't have launch state */
vmcs12->launch_state = !launch;
} else if (enable_shadow_vmcs) {
@@ -3507,6 +3579,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (nested_vmx_check_controls(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
if (nested_vmx_check_host_state(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
@@ -3537,7 +3612,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* snapshot restore (migration).
*
* In this flow, it is assumed that vmcs12 cache was
- * trasferred as part of captured nVMX state and should
+ * transferred as part of captured nVMX state and should
* therefore not be read from guest memory (which may not
* exist on destination host yet).
*/
@@ -3555,7 +3630,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
break;
case GUEST_ACTIVITY_WAIT_SIPI:
@@ -3616,12 +3691,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
{
u32 idt_vectoring;
unsigned int nr;
- if (vcpu->arch.exception.injected) {
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -3654,6 +3751,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
idt_vectoring |= INTR_TYPE_EXT_INTR;
vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
}
}
@@ -3679,25 +3778,29 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
}
}
-static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
void *vapic_page;
u16 status;
- if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
- return;
+ if (!vmx->nested.pi_pending)
+ return 0;
+
+ if (!vmx->nested.pi_desc)
+ goto mmio_needed;
vmx->nested.pi_pending = false;
+
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
+ return 0;
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
- return;
+ goto mmio_needed;
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
@@ -3710,6 +3813,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
}
nested_mark_vmcs12_pages_dirty(vcpu);
+ return 0;
+
+mmio_needed:
+ kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+ return -ENXIO;
}
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
@@ -3810,9 +3918,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
/*
* Process any exceptions that are not debug traps before MTF.
+ *
+ * Note that only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+ * while delivering the pending exception.
*/
+
if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
@@ -3829,7 +3943,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ if (vmx->nested.nested_run_pending)
return -EBUSY;
if (!nested_vmx_check_exception(vcpu, &exit_qual))
goto no_vmexit;
@@ -3878,8 +3992,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
no_vmexit:
- vmx_complete_nested_posted_interrupt(vcpu);
- return 0;
+ return vmx_complete_nested_posted_interrupt(vcpu);
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -4023,10 +4136,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.hv_evmcs)
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -4105,13 +4219,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
+ if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
- vmcs12->vm_exit_intr_info = exit_intr_info;
-
- vmcs12->idt_vectoring_info_field = 0;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
@@ -4123,7 +4239,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Transfer the event that L0 or L1 may wanted to inject into
* L2 to IDT_VECTORING_INFO_FIELD.
*/
- vmcs12_save_pending_event(vcpu, vmcs12);
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
* According to spec, there's no need to store the guest's
@@ -4195,7 +4316,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* Only PDPTE load can fail as the value of cr3 was checked on entry and
* couldn't have changed.
*/
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
@@ -4235,7 +4356,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
seg.l = 1;
else
seg.db = 1;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
seg = (struct kvm_segment) {
.base = 0,
.limit = 0xFFFFFFFF,
@@ -4246,17 +4367,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.g = 1
};
seg.selector = vmcs12->host_ds_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
seg.selector = vmcs12->host_es_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
seg.selector = vmcs12->host_ss_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
seg.selector = vmcs12->host_fs_selector;
seg.base = vmcs12->host_fs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
seg.selector = vmcs12->host_gs_selector;
seg.base = vmcs12->host_gs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
seg = (struct kvm_segment) {
.base = vmcs12->host_tr_base,
.limit = 0x67,
@@ -4264,17 +4385,20 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.type = 11,
.present = 1
};
- vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ memset(&seg, 0, sizeof(seg));
+ seg.unusable = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
@@ -4350,9 +4474,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
/*
* This nasty bit of open coding is a compromise between blindly
* loading L1's MSRs using the exit load lists (incorrect emulation
@@ -4422,11 +4543,18 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * Enlightened VMCS after migration and we still need to
+ * do that when something is forcing L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4441,8 +4569,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
- vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
+ vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ }
if (likely(!vmx->fail)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
@@ -4479,12 +4610,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
if (vmx->nested.l1_tpr_threshold != -1)
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
- if (kvm_has_tsc_control)
- decache_tsc_multiplier(vmx);
-
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
@@ -4509,8 +4640,13 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
}
+ if (vmx->nested.update_vmcs01_apicv_status) {
+ vmx->nested.update_vmcs01_apicv_status = false;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
- (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+ (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
@@ -4558,6 +4694,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx->fail = 0;
}
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
/*
* Decode the memory-address operand of a vmx instruction, as recorded on an
* exit caused by such an instruction (run by a guest hypervisor).
@@ -4688,7 +4829,8 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl)
{
struct vcpu_vmx *vmx;
@@ -4696,7 +4838,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
return;
vmx = to_vmx(vcpu);
- if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ if (vcpu_has_perf_global_ctrl) {
vmx->nested.msrs.entry_ctls_high |=
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high |=
@@ -4743,18 +4885,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/*
- * We should allocate a shadow vmcs for vmcs01 only when L1
- * executes VMXON and free it when L1 executes VMXOFF.
- * As it is invalid to execute VMXON twice, we shouldn't reach
- * here when vmcs01 already have an allocated shadow vmcs.
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
*/
- WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
- if (!loaded_vmcs->shadow_vmcs) {
- loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
- if (loaded_vmcs->shadow_vmcs)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- }
return loaded_vmcs->shadow_vmcs;
}
@@ -4771,6 +4915,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -4807,14 +4952,7 @@ out_vmcs02:
return -ENOMEM;
}
-/*
- * Emulate the VMXON instruction.
- * Currently, we just remember that VMX is active, and do not save or even
- * inspect the argument to VMXON (the so-called "VMXON pointer") because we
- * do not currently need to store anything in that guest-allocated memory
- * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
- * argument is different from the VMXON pointer (which the spec says they do).
- */
+/* Emulate the VMXON instruction. */
static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
@@ -4883,7 +5021,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.current_vmptr == -1ull)
+ if (vmx->nested.current_vmptr == INVALID_GPA)
return;
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
@@ -4901,9 +5039,9 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr >> PAGE_SHIFT,
vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
}
/* Emulate the VMXOFF instruction */
@@ -4960,6 +5098,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12,
launch_state),
&zero, sizeof(zero));
+ } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
+ nested_release_evmcs(vcpu);
}
return nested_vmx_succeed(vcpu);
@@ -4995,27 +5135,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- /*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
- * any VMREAD sets the ALU flags for VMfailInvalid.
- */
- if (vmx->nested.current_vmptr == -1ull ||
- (is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
- return nested_vmx_failInvalid(vcpu);
-
/* Decode instruction info and find the field to read */
- field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
+ * enlightened VMCS is active VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- /* Read the field, zero-extended to a u64 value */
- value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ }
/*
* Now copy part of this value to register or memory, as requested.
@@ -5023,7 +5185,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
if (instr_info & BIT(10)) {
- kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
+ kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
} else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -5088,16 +5250,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
/*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMWRITE sets the ALU flags for VMfailInvalid.
*/
- if (vmx->nested.current_vmptr == -1ull ||
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
if (instr_info & BIT(10))
- value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
+ value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -5108,9 +5270,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return kvm_handle_memory_failure(vcpu, r, &e);
}
- field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
+ offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -5179,6 +5341,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
}
/* Emulate the VMPTRLD instruction */
@@ -5201,14 +5364,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
- if (vmx->nested.hv_evmcs)
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return 1;
if (vmx->nested.current_vmptr != vmptr) {
- struct kvm_host_map map;
- struct vmcs12 *new_vmcs12;
+ struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
+ struct vmcs_hdr hdr;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -5219,12 +5382,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
- new_vmcs12 = map.hva;
+ if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
- if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
- (new_vmcs12->hdr.shadow_vmcs &&
+ if (hdr.revision_id != VMCS12_REVISION ||
+ (hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
- kvm_vcpu_unmap(vcpu, &map, false);
return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -5235,8 +5402,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
- kvm_vcpu_unmap(vcpu, &map, false);
+ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE)) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
set_current_vmptr(vmx, vmptr);
}
@@ -5257,7 +5427,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+ if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
return 1;
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
@@ -5272,14 +5442,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return nested_vmx_succeed(vcpu);
}
-#define EPTP_PA_MASK GENMASK_ULL(51, 12)
-
-static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
-{
- return VALID_PAGE(root_hpa) &&
- ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
-}
-
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
@@ -5292,7 +5454,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
struct {
u64 eptp, gpa;
} operand;
- int i, r;
+ int i, r, gpr_index;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
@@ -5305,7 +5467,8 @@ static int handle_invept(struct kvm_vcpu *vcpu)
return 1;
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
@@ -5335,7 +5498,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0;
- if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+ if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
operand.eptp))
roots_to_free |= KVM_MMU_ROOT_CURRENT;
@@ -5355,7 +5518,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
if (roots_to_free)
- kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
return nested_vmx_succeed(vcpu);
}
@@ -5372,7 +5535,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
u64 gla;
} operand;
u16 vpid02;
- int r;
+ int r, gpr_index;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -5385,7 +5548,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
types = (vmx->nested.msrs.vpid_caps &
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
@@ -5434,8 +5598,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
/*
* Sync the shadow page tables if EPT is disabled, L1 is invalidating
- * linear mappings for L2 (tagged with L2's VPID). Free all roots as
- * VPIDs are not tracked in the MMU role.
+ * linear mappings for L2 (tagged with L2's VPID). Free all guest
+ * roots as VPIDs are not tracked in the MMU role.
*
* Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
* an MMU when EPT is disabled.
@@ -5443,8 +5607,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
*/
if (!enable_ept)
- kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
- KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
return nested_vmx_succeed(vcpu);
}
@@ -5454,23 +5617,16 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
{
u32 index = kvm_rcx_read(vcpu);
u64 new_eptp;
- bool accessed_dirty;
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!nested_cpu_has_eptp_switching(vmcs12) ||
- !nested_cpu_has_ept(vmcs12))
+ if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
return 1;
-
if (index >= VMFUNC_EPTP_ENTRIES)
return 1;
-
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
&new_eptp, index * 8, 8))
return 1;
- accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
-
/*
* If the (L2) guest does a vmfunc to the currently
* active ept pointer, we don't have to do anything else
@@ -5479,16 +5635,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
- kvm_mmu_unload(vcpu);
- mmu->ept_ad = accessed_dirty;
- mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = new_eptp;
- /*
- * TODO: Check what's the correct approach in case
- * mmu reload fails. Currently, we just let the next
- * reload potentially fail
- */
- kvm_mmu_reload(vcpu);
+ nested_ept_new_eptp(vcpu);
+
+ if (!nested_cpu_has_vpid(vmcs12))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
return 0;
@@ -5511,7 +5662,17 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
}
vmcs12 = get_vmcs12(vcpu);
- if ((vmcs12->vm_function_control & (1 << function)) == 0)
+
+ /*
+ * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
+ * is enabled in vmcs02 if and only if it's enabled in vmcs12.
+ */
+ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!(vmcs12->vm_function_control & BIT_ULL(function)))
goto fail;
switch (function) {
@@ -5547,7 +5708,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
gpa_t bitmap, last_bitmap;
u8 b;
- last_bitmap = (gpa_t)-1;
+ last_bitmap = INVALID_GPA;
b = -1;
while (size > 0) {
@@ -5646,7 +5807,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
reg = (exit_qualification >> 8) & 15;
- val = kvm_register_readl(vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
switch (cr) {
case 0:
if (vmcs12->cr0_guest_host_mask &
@@ -5705,6 +5866,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
return false;
}
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 encls_leaf;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+ return false;
+
+ encls_leaf = kvm_rax_read(vcpu);
+ if (encls_leaf > 62)
+ encls_leaf = 63;
+ return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12, gpa_t bitmap)
{
@@ -5761,7 +5937,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
if (is_nmi(intr_info))
return true;
else if (is_page_fault(intr_info))
- return vcpu->arch.apf.host_apf_flags || !enable_ept;
+ return vcpu->arch.apf.host_apf_flags ||
+ vmx_need_pf_intercept(vcpu);
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -5769,6 +5946,9 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return true;
+ else if (is_alignment_check(intr_info) &&
+ !vmx_guest_inject_ac(vcpu))
+ return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
@@ -5801,8 +5981,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
return true;
- case EXIT_REASON_ENCLS:
- /* SGX is never exposed to L1 */
+ case EXIT_REASON_BUS_LOCK:
+ /*
+ * At present, bus lock VM exit is never exposed to L1.
+ * Handle L2's bus locks in L0 directly.
+ */
return true;
default:
break;
@@ -5927,6 +6110,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+ case EXIT_REASON_ENCLS:
+ return nested_vmx_exit_handled_encls(vcpu, vmcs12);
default:
return true;
}
@@ -5958,7 +6143,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -5999,8 +6184,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
.format = KVM_STATE_NESTED_FORMAT_VMX,
.size = sizeof(kvm_state),
.hdr.vmx.flags = 0,
- .hdr.vmx.vmxon_pa = -1ull,
- .hdr.vmx.vmcs12_pa = -1ull,
+ .hdr.vmx.vmxon_pa = INVALID_GPA,
+ .hdr.vmx.vmcs12_pa = INVALID_GPA,
.hdr.vmx.preemption_timer_deadline = 0,
};
struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
@@ -6020,12 +6205,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (vmx_has_valid_vmcs12(vcpu)) {
kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
- if (vmx->nested.hv_evmcs)
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
if (is_guest_mode(vcpu) &&
nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull)
+ vmcs12->vmcs_link_pointer != INVALID_GPA)
kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
}
@@ -6076,8 +6262,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
} else {
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (vmx->nested.hv_evmcs)
- copy_enlightened_to_vmcs12(vmx);
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ /*
+ * L1 hypervisor is not obliged to keep eVMCS
+ * clean fields data always up-to-date while
+ * not in guest mode, 'hv_clean_fields' is only
+ * supposed to be actual upon vmentry so we need
+ * to ignore it here and do full copy.
+ */
+ copy_enlightened_to_vmcs12(vmx, 0);
else if (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
}
@@ -6094,7 +6287,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
@@ -6129,11 +6322,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
if (kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
return -EINVAL;
/*
@@ -6187,7 +6380,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmx_leave_nested(vcpu);
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
return 0;
vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
@@ -6200,13 +6393,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
/* See vmx_has_valid_vmcs12. */
if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
(kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
- (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
return -EINVAL;
else
return 0;
}
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
!page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
return -EINVAL;
@@ -6219,6 +6412,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* restored yet. EVMCS will be mapped from
* nested_get_vmcs12_pages().
*/
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
} else {
return -EINVAL;
@@ -6250,7 +6444,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
if (kvm_state->size <
@@ -6283,6 +6477,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
goto error_guest_mode;
@@ -6303,6 +6498,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
}
/*
+ * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
+ * that madness to get the encoding for comparison.
+ */
+#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
+
+static u64 nested_vmx_calc_vmcs_enum_msr(void)
+{
+ /*
+ * Note these are the so called "index" of the VMCS field encoding, not
+ * the index into vmcs12.
+ */
+ unsigned int max_idx, idx;
+ int i;
+
+ /*
+ * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
+ * vmcs12, regardless of whether or not the associated feature is
+ * exposed to L1. Simply find the field with the highest index.
+ */
+ max_idx = 0;
+ for (i = 0; i < nr_vmcs12_fields; i++) {
+ /* The vmcs12 table is very, very sparsely populated. */
+ if (!vmcs12_field_offsets[i])
+ continue;
+
+ idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
+ if (idx > max_idx)
+ max_idx = idx;
+ }
+
+ return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
+}
+
+/*
* nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
* returned for the various VMX controls MSRs when nested VMX is enabled.
* The same values should also be used to verify that vmcs12 control fields are
@@ -6438,7 +6667,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_TSC_SCALING;
/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -6502,6 +6732,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (enable_sgx)
+ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
@@ -6543,8 +6776,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
- /* highest index: VMX_PREEMPTION_TIMER_VALUE */
- msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+ msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}
void nested_vmx_hardware_unsetup(void)
@@ -6597,8 +6829,11 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
}
struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
+ .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
.get_nested_state_pages = vmx_get_nested_state_pages,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 197148d76b8f..c92cea0b8ccc 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -32,7 +32,8 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
+ bool vcpu_has_perf_global_ctrl);
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
@@ -56,14 +57,9 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * In case we do two consecutive get/set_nested_state()s while L2 was
- * running hv_evmcs may end up not being mapped (we map it from
- * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always
- * have vmcs12 if it is true.
- */
- return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
- vmx->nested.hv_evmcs;
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ return vmx->nested.current_vmptr != -1ull ||
+ vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID;
}
static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
@@ -244,6 +240,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
PIN_BASED_EXT_INTR_MASK;
}
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
/*
* if fixed0[i] == 1: val[i] must be 1
* if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..37e9eb32e3d9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ /* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
@@ -68,34 +68,29 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
-static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
- if (intel_arch_events[i].eventsel == event_select
- && intel_arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+ if (intel_arch_events[i].eventsel != event_select ||
+ intel_arch_events[i].unit_mask != unit_mask)
+ continue;
- return intel_arch_events[i].event_type;
-}
+ /* disable event that reported as not present by cpuid */
+ if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+ return PERF_COUNT_HW_MAX + 1;
-static unsigned intel_find_fixed_event(int idx)
-{
- u32 event;
- size_t size = ARRAY_SIZE(fixed_pmc_events);
+ break;
+ }
- if (idx >= size)
+ if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX;
- event = fixed_pmc_events[array_index_nospec(idx, size)];
- return intel_arch_events[event].event_type;
+ return intel_arch_events[i].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -118,16 +113,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
}
-/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
idx &= ~(3u << 30);
- return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
- (fixed && idx >= pmu->nr_arch_fixed_counters);
+ return fixed ? idx < pmu->nr_arch_fixed_counters
+ : idx < pmu->nr_arch_gp_counters;
}
static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
@@ -365,7 +359,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = pmu->global_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- msr_info->data = pmu->global_ovf_ctrl;
+ msr_info->data = 0;
return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -395,6 +389,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct kvm_pmc *pmc;
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ u64 reserved_bits;
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -423,7 +418,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
return 0;
}
break;
@@ -437,20 +431,20 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
return 0;
- if (!(data & pmu->reserved_bits)) {
+ reserved_bits = pmu->reserved_bits;
+ if ((pmc->idx == 2) &&
+ (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
+ reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
+ if (!(data & reserved_bits)) {
reprogram_gp_counter(pmc, data);
return 0;
}
@@ -461,6 +455,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+{
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+ struct kvm_pmc *pmc;
+ u32 event;
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmc = &pmu->fixed_counters[i];
+ event = fixed_pmc_events[array_index_nospec(i, size)];
+ pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
+ intel_arch_events[event].eventsel;
+ }
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -477,9 +486,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
+ if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
@@ -502,12 +512,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters = 0;
} else {
pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- x86_pmu.num_counters_fixed);
+ min3(ARRAY_SIZE(fixed_pmc_events),
+ (size_t) edx.split.num_counters_fixed,
+ (size_t) x86_pmu.num_counters_fixed);
edx.split.bit_width_fixed = min_t(int,
edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
+ setup_fixed_pmc_eventsel(pmu);
}
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
@@ -523,15 +535,18 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
- (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
- pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
+ pmu->reserved_bits ^= HSW_IN_TX;
+ pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+ }
bitmap_set(pmu->all_valid_pmc_idx,
0, pmu->nr_arch_gp_counters);
bitmap_set(pmu->all_valid_pmc_idx,
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
- nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+ nested_vmx_pmu_refresh(vcpu,
+ intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
if (intel_pmu_lbr_is_compatible(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
@@ -555,7 +570,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
@@ -581,15 +596,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc->counter = pmc->eventsel = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
pmc->counter = 0;
}
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
intel_pmu_release_guest_lbr_event(vcpu);
}
@@ -705,9 +719,8 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
intel_pmu_release_guest_lbr_event(vcpu);
}
-struct kvm_pmu_ops intel_pmu_ops = {
- .find_arch_event = intel_find_arch_event,
- .find_fixed_event = intel_find_fixed_event,
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
+ .pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 4831bc44ce66..07e5fcf5a5aa 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,63 +5,115 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPUs posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices
+ * wake the target vCPUs. vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
*/
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vmx(vcpu)->pi_desc);
}
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+ * update must be retried with a fresh snapshot an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (cmpxchg64(&pi_desc->control, old, new) != old)
+ return -EBUSY;
+
+ return 0;
+}
+
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
+ unsigned long flags;
unsigned int dest;
/*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+ * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
*/
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
/*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed.
*/
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ /*
+ * Clear SN if it was set due to being preempted. Again, do
+ * this even if there is no assigned device for simplicity.
+ */
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
+ return;
}
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
+
+ /*
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_del(&vmx->pi_wakeup_list);
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ }
- dest = cpu_physical_id(cpu);
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
+ new.ndst = dest;
new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
+
+ local_irq_restore(flags);
after_clear_sn:
@@ -77,130 +129,71 @@ after_clear_sn:
pi_set_on(pi_desc);
}
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
}
/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
*/
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
- struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
+ local_irq_save(flags);
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vmx->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
+ WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
+ local_irq_restore(flags);
}
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
- if (vcpu->pre_pcpu == -1)
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
+ if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ pi_enable_wakeup_handler(vcpu);
+
+ /*
+ * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
+ * as blocking and preempted, e.g. if it's preempted between setting
+ * its wait state and manually scheduling out.
+ */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
/*
@@ -208,24 +201,24 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/
void pi_wakeup_handler(void)
{
- struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
+ struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
+ struct vcpu_vmx *vmx;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ raw_spin_lock(spinlock);
+ list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
+ if (pi_test_on(&vmx->pi_desc))
+ kvm_vcpu_wake_up(&vmx->vcpu);
}
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ raw_spin_unlock(spinlock);
}
void __init pi_init_cpu(int cpu)
{
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -238,7 +231,21 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * pi_update_irte - set IRTE for Posted-Interrupts
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in vmx_vcpu_pi_put().
+ */
+void vmx_pi_start_assignment(struct kvm *kvm)
+{
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
+}
+
+/*
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
@@ -246,8 +253,8 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set)
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
@@ -256,9 +263,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
@@ -307,7 +312,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
continue;
}
- vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 0bdc41391c5b..9a45d5c9f116 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,7 +40,13 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
@@ -74,13 +80,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_on(struct pi_desc *pi_desc)
+static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_sn(struct pi_desc *pi_desc)
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
@@ -88,12 +94,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
- bool set);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000000..35e7ec91ae86
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode. Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+ int size, int alignment, gva_t *gva)
+{
+ struct kvm_segment s;
+ bool fault;
+
+ /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+ *gva = offset;
+ if (!is_long_mode(vcpu)) {
+ vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+ *gva += s.base;
+ }
+
+ if (!IS_ALIGNED(*gva, alignment)) {
+ fault = true;
+ } else if (likely(is_long_mode(vcpu))) {
+ fault = is_noncanonical_address(*gva, vcpu);
+ } else {
+ *gva &= 0xffffffff;
+ fault = (s.unusable) ||
+ (s.type != 2 && s.type != 3) ||
+ (*gva > s.limit) ||
+ ((s.base != 0 || s.limit != 0xffffffff) &&
+ (((u64)*gva + size - 1) > s.limit + 1));
+ }
+ if (fault)
+ kvm_inject_gp(vcpu, 0);
+ return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+ unsigned int size)
+{
+ uint64_t data[2] = { addr, size };
+
+ __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+ unsigned int size)
+{
+ if (__copy_from_user(data, (void __user *)hva, size)) {
+ sgx_handle_emulation_failure(vcpu, hva, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ gpa_t *gpa)
+{
+ struct x86_exception ex;
+
+ if (write)
+ *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+ else
+ *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+ if (*gpa == UNMAPPED_GVA) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+ *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+ if (kvm_is_error_hva(*hva)) {
+ sgx_handle_emulation_failure(vcpu, gpa, 1);
+ return -EFAULT;
+ }
+
+ *hva |= gpa & ~PAGE_MASK;
+
+ return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+ struct x86_exception ex;
+
+ /*
+ * A non-EPCM #PF indicates a bad userspace HVA. This *should* check
+ * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+ * but the error code isn't (yet) plumbed through the ENCLS helpers.
+ */
+ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ /*
+ * If the guest thinks it's running on SGX2 hardware, inject an SGX
+ * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+ * #PF on SGX2). The assumption is that EPCM faults are much more
+ * likely than a bad userspace address.
+ */
+ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ memset(&ex, 0, sizeof(ex));
+ ex.vector = PF_VECTOR;
+ ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+ PFERR_SGX_MASK;
+ ex.address = gva;
+ ex.error_code_valid = true;
+ ex.nested_page_fault = false;
+ kvm_inject_page_fault(vcpu, &ex);
+ } else {
+ kvm_inject_gp(vcpu, 0);
+ }
+ return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+ struct sgx_pageinfo *pageinfo,
+ unsigned long secs_hva,
+ gva_t secs_gva)
+{
+ struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+ struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+ u64 attributes, xfrm, size;
+ u32 miscselect;
+ u8 max_size_log2;
+ int trapnr, ret;
+
+ sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!sgx_12_0 || !sgx_12_1) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ miscselect = contents->miscselect;
+ attributes = contents->attributes;
+ xfrm = contents->xfrm;
+ size = contents->size;
+
+ /* Enforce restriction of access to the PROVISIONKEY. */
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+ (attributes & SGX_ATTR_PROVISIONKEY)) {
+ if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+ pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+ if ((u32)miscselect & ~sgx_12_0->ebx ||
+ (u32)attributes & ~sgx_12_1->eax ||
+ (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+ (u32)xfrm & ~sgx_12_1->ecx ||
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restriction on max enclave size. */
+ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+ sgx_12_0->edx;
+ if (size >= BIT_ULL(max_size_log2))
+ kvm_inject_gp(vcpu, 0);
+
+ /*
+ * sgx_virt_ecreate() returns:
+ * 1) 0: ECREATE was successful
+ * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+ * exception number.
+ * 3) -EINVAL: access_ok() on @secs_hva failed. This should never
+ * happen as KVM checks host addresses at memslot creation.
+ * sgx_virt_ecreate() has already warned in this case.
+ */
+ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+ if (!ret)
+ return kvm_skip_emulated_instruction(vcpu);
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ gva_t pageinfo_gva, secs_gva;
+ gva_t metadata_gva, contents_gva;
+ gpa_t metadata_gpa, contents_gpa, secs_gpa;
+ unsigned long metadata_hva, contents_hva, secs_hva;
+ struct sgx_pageinfo pageinfo;
+ struct sgx_secs *contents;
+ struct x86_exception ex;
+ int r;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+ return 1;
+
+ /*
+ * Copy the PAGEINFO to local memory, its pointers need to be
+ * translated, i.e. we need to do a deep copy/translate.
+ */
+ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+ sizeof(pageinfo), &ex);
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return 1;
+ } else if (r != X86EMUL_CONTINUE) {
+ sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+ sizeof(pageinfo));
+ return 0;
+ }
+
+ if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+ sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+ &contents_gva))
+ return 1;
+
+ /*
+ * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+ sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid.
+ */
+ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+ sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+ return 0;
+
+ /*
+ * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+ * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+ * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+ * enforce restriction of access to the PROVISIONKEY.
+ */
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!contents)
+ return -ENOMEM;
+
+ /* Exit to userspace if copying from a host userspace address fails. */
+ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+ free_page((unsigned long)contents);
+ return 0;
+ }
+
+ pageinfo.metadata = metadata_hva;
+ pageinfo.contents = (u64)contents;
+
+ r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+ free_page((unsigned long)contents);
+
+ return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+ unsigned long sig_hva, secs_hva, token_hva, rflags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t sig_gva, secs_gva, token_gva;
+ gpa_t sig_gpa, secs_gpa, token_gpa;
+ int ret, trapnr;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+ return 1;
+
+ /*
+ * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+ sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid. Note, all structures are aligned and
+ * cannot split pages.
+ */
+ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+ sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+ return 0;
+
+ ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+ (void __user *)secs_hva,
+ vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ /*
+ * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+ * @token_hva or @secs_hva. This should never happen as KVM checks host
+ * addresses at memslot creation. sgx_virt_einit() has already warned
+ * in this case, so just return.
+ */
+ if (ret < 0)
+ return ret;
+
+ rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+ X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_OF);
+ if (ret)
+ rflags |= X86_EFLAGS_ZF;
+ else
+ rflags &= ~X86_EFLAGS_ZF;
+ vmx_set_rflags(vcpu, rflags);
+
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+ if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ return false;
+
+ if (leaf >= ECREATE && leaf <= ETRACK)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+ if (leaf >= EAUG && leaf <= EMODT)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+ return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+ const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+ return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+ u32 leaf = (u32)kvm_rax_read(vcpu);
+
+ if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ } else if (!sgx_enabled_in_guest_bios(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ } else {
+ if (leaf == ECREATE)
+ return handle_encls_ecreate(vcpu);
+ if (leaf == EINIT)
+ return handle_encls_einit(vcpu);
+ WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+ return 0;
+ }
+ return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+ /*
+ * Use Intel's default value for Skylake hardware if Launch Control is
+ * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+ * Launch Control is supported and enabled, i.e. mimic the reset value
+ * and let the guest write the MSRs at will. If Launch Control is
+ * supported but disabled, then use the current MSR values as the hash
+ * MSRs exist but are read-only (locked and not writable).
+ */
+ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+ rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+ sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+ sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+ sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+ } else {
+ /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+ sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *guest_cpuid;
+ u32 eax, ebx, ecx, edx;
+
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+ guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+ return true;
+
+ return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /*
+ * There is no software enable bit for SGX that is virtualized by
+ * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+ * guest (either by the host or by the guest's BIOS) but enabled in the
+ * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+ * the expected system behavior for ENCLS.
+ */
+ u64 bitmap = -1ull;
+
+ /* Nothing to do if hardware doesn't support SGX */
+ if (!cpu_has_vmx_encls_vmexit())
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ sgx_enabled_in_guest_bios(vcpu)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+ if (sgx_intercept_encls_ecreate(vcpu))
+ bitmap |= (1 << ECREATE);
+ }
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+ /*
+ * Trap and execute EINIT if launch control is enabled in the
+ * host using the guest's values for launch control MSRs, even
+ * if the guest's values are fixed to hardware default values.
+ * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+ * the MSRs is extraordinarily expensive.
+ */
+ if (boot_cpu_has(X86_FEATURE_SGX_LC))
+ bitmap |= (1 << EINIT);
+
+ if (!vmcs12 && is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+ bitmap |= vmcs12->encls_exiting_bitmap;
+ }
+ vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000000..a400888b376d
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ /* Nothing to do if hardware doesn't support SGX */
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 1472c6c376f7..2b9d7a7e83f7 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -11,6 +11,8 @@
#include "capabilities.h"
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+
struct vmcs_hdr {
u32 revision_id:31;
u32 shadow_vmcs:1;
@@ -102,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info)
return is_exception_n(intr_info, BP_VECTOR);
}
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
static inline bool is_page_fault(u32 intr_info)
{
return is_exception_n(intr_info, PF_VECTOR);
@@ -117,11 +124,21 @@ static inline bool is_gp_fault(u32 intr_info)
return is_exception_n(intr_info, GP_VECTOR);
}
+static inline bool is_alignment_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, AC_VECTOR);
+}
+
static inline bool is_machine_check(u32 intr_info)
{
return is_exception_n(intr_info, MC_VECTOR);
}
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
@@ -164,4 +181,12 @@ static inline int vmcs_field_readonly(unsigned long field)
return (((field >> 10) & 0x3) == 1);
}
+#define VMCS_FIELD_INDEX_SHIFT (1)
+#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1)
+
+static inline unsigned int vmcs_field_index(unsigned long field)
+{
+ return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT;
+}
+
#endif /* __KVM_X86_VMX_VMCS_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..2251b60920f8 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -2,14 +2,13 @@
#include "vmcs12.h"
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name) \
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -37,6 +36,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
FIELD64(PML_ADDRESS, pml_address),
FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(TSC_MULTIPLIER, tsc_multiplier),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VMREAD_BITMAP, vmread_bitmap),
FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
@@ -150,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..746129ddd5ae 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,9 @@ struct __packed vmcs12 {
u64 vm_function_control;
u64 eptp_list_address;
u64 pml_address;
- u64 padding64[3]; /* room for future expansion */
+ u64 encls_exiting_bitmap;
+ u64 tsc_multiplier;
+ u64 padding64[1]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
@@ -204,12 +206,6 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
- * supported VMCS12 field encoding.
- */
-#define VMCS12_MAX_FIELD_INDEX 0x17
-
-/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
#define CHECK_OFFSET(field, loc) \
@@ -256,6 +252,8 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(vm_function_control, 296);
CHECK_OFFSET(eptp_list_address, 304);
CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(encls_exiting_bitmap, 320);
+ CHECK_OFFSET(tsc_multiplier, 328);
CHECK_OFFSET(cr0_guest_host_mask, 344);
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
@@ -363,12 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
{
unsigned short offset;
unsigned int index;
@@ -381,14 +377,12 @@ static inline short vmcs_field_to_offset(unsigned long field)
return -ENOENT;
index = array_index_nospec(index, nr_vmcs12_fields);
- offset = vmcs_field_to_offset_table[index];
+ offset = vmcs12_field_offsets[index];
if (offset == 0)
return -ENOENT;
return offset;
}
-#undef ROL16
-
static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
u16 offset)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 3a6461694fc2..435c187927c4 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter)
je 2f
1: vmresume
- ret
+ RET
2: vmlaunch
- ret
+ RET
3: cmpb $0, kvm_rebooting
je 4f
- ret
+ RET
4: ud2
_ASM_EXTABLE(1b, 3b)
@@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit)
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
- ret
+ RET
SYM_FUNC_END(vmx_vmexit)
/**
@@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
@@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline)
pop %_ASM_AX
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmread_error_trampoline)
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
@@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 32cf8287d4a7..3a919e49129b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -35,7 +35,9 @@
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -51,12 +53,14 @@
#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
+#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
@@ -99,7 +103,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
-bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
@@ -134,8 +137,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -156,9 +158,13 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
MSR_IA32_PRED_CMD,
MSR_IA32_TSC,
+#ifdef CONFIG_X86_64
MSR_FS_BASE,
MSR_GS_BASE,
MSR_KERNEL_GS_BASE,
+ MSR_IA32_XFD,
+ MSR_IA32_XFD_ERR,
+#endif
MSR_IA32_SYSENTER_CS,
MSR_IA32_SYSENTER_ESP,
MSR_IA32_SYSENTER_EIP,
@@ -223,6 +229,9 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -354,6 +363,60 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr |= FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
@@ -361,8 +424,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
- u32 msr, int type);
void vmx_vmexit(void);
@@ -453,102 +514,10 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_uret_msrs_list[], even in i386 builds.
- */
-static const u32 vmx_uret_msrs_list[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
- MSR_IA32_TSX_CTRL,
-};
-
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- u64 tmp_eptp = INVALID_PAGE;
- int i;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!VALID_PAGE(tmp_eptp)) {
- tmp_eptp = to_vmx(vcpu)->ept_pointer;
- } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_MISMATCH;
- return;
- }
- }
-
- to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
-static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
- void *data)
-{
- struct kvm_tlb_range *range = data;
-
- return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
- range->pages);
-}
-
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
-{
- u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
- /*
- * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
- * of the base of EPT PML4 table, strip off EPT configuration
- * information.
- */
- if (range)
- return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
- kvm_fill_hv_flush_list_func, (void *)range);
- else
- return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
-}
-
-static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
-{
- struct kvm_vcpu *vcpu;
- int ret = 0, i;
-
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-
- if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
- check_ept_pointer_match(kvm);
-
- if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- /* If ept_pointer is invalid pointer, bypass flush request. */
- if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
- ret |= __hv_remote_flush_tlb_with_range(
- kvm, vcpu, range);
- }
- } else {
- ret = __hv_remote_flush_tlb_with_range(kvm,
- kvm_get_vcpu(kvm, 0), range);
- }
-
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- return ret;
-}
-static int hv_remote_flush_tlb(struct kvm *kvm)
-{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
-}
-
static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
@@ -559,7 +528,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
* evmcs in singe VM shares same assist page.
*/
if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
if (!*p_hv_pa_pg)
return -ENOMEM;
@@ -629,11 +598,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
static int possible_passthrough_msr_slot(u32 msr)
{
u32 i;
@@ -677,21 +641,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r;
}
-static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nr_uret_msrs; ++i)
- if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
- return i;
- return -1;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __vmx_find_uret_msr(vmx, msr);
+ i = kvm_find_user_return_msr(msr);
if (i >= 0)
return &vmx->guest_uret_msrs[i];
return NULL;
@@ -700,17 +654,16 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data)
{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
- if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
@@ -744,10 +697,10 @@ static void __loaded_vmcs_clear(void *arg)
/*
* Ensure all writes to loaded_vmcs, including deleting it from its
- * current percpu list, complete before setting loaded_vmcs->vcpu to
- * -1, otherwise a different cpu can see vcpu == -1 first and add
- * loaded_vmcs to its percpu list before it's deleted from this cpu's
- * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
@@ -846,42 +799,44 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
else {
- /*
- * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
- * between guest and host. In that case we only care about present
- * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
- * prepare_vmcs02_rare.
- */
- bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
- int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
+ int mask = 0, match = 0;
+
+ if (enable_ept && (eb & (1u << PF_VECTOR))) {
+ /*
+ * If EPT is enabled, #PF is currently only intercepted
+ * if MAXPHYADDR is smaller on the guest than on the
+ * host. In that case we only care about present,
+ * non-reserved faults. For vmcs02, however, PFEC_MASK
+ * and PFEC_MATCH are set in prepare_vmcs02_rare.
+ */
+ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
+ match = PFERR_PRESENT_MASK;
+ }
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
+ MSR_IA32_SPEC_CTRL);
}
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -1058,7 +1013,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false;
}
- i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0)
return false;
@@ -1154,8 +1109,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
@@ -1165,12 +1120,16 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
@@ -1220,11 +1179,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true;
- for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
- kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask);
-
+ }
}
if (vmx->nested.need_vmcs12_to_shadow_sync)
@@ -1370,7 +1332,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1386,16 +1347,14 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
-
- /* Setup TSC multiplier */
- if (kvm_has_tsc_control &&
- vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
- decache_tsc_multiplier(vmx);
}
/*
@@ -1420,7 +1379,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
-static bool emulation_required(struct kvm_vcpu *vcpu)
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
@@ -1464,7 +1423,12 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->emulation_required = vmx_emulation_required(vcpu);
+}
+
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1529,7 +1493,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
/*
* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
- * utilize encodings marked reserved will casue a #GP fault.
+ * utilize encodings marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
@@ -1553,29 +1517,43 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
return 1;
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point at the failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
return true;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
+ union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
unsigned long rip, orig_rip;
+ u32 instr_len;
/*
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1564,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN(exit_reason.enclave_mode,
+ "KVM: skipping instruction after SGX enclave VM-Exit");
+
orig_rip = kvm_rip_read(vcpu);
- rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
/*
* We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1606,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
return 0;
}
+rip_updated:
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
@@ -1693,73 +1696,90 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
{
- struct vmx_uret_msr tmp;
- int from, to;
+ struct vmx_uret_msr *uret_msr;
- from = __vmx_find_uret_msr(vmx, msr);
- if (from < 0)
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
return;
- to = vmx->nr_active_uret_msrs++;
- tmp = vmx->guest_uret_msrs[to];
- vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
- vmx->guest_uret_msrs[from] = tmp;
+ uret_msr->load_into_hardware = load_into_hardware;
}
/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configuring user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
*/
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
- vmx->guest_uret_msrs_loaded = false;
- vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- vmx_setup_uret_msr(vmx, MSR_STAR);
- vmx_setup_uret_msr(vmx, MSR_LSTAR);
- vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
- }
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
- if (update_transition_efer(vmx))
- vmx_setup_uret_msr(vmx, MSR_EFER);
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
- if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
- vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(&vmx->vcpu);
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
}
-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u64 g_tsc_offset = 0;
- /*
- * We're here if L1 chose not to trap WRMSR to TSC. According
- * to the spec, this should set L1's TSC; The offset that L1
- * set for L2 remains unchanged, and still needs to be added
- * to the newly set TSC to get L2's TSC.
- */
- if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
- g_tsc_offset = vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
+ return vmcs12->tsc_offset;
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- vcpu->arch.tsc_offset - g_tsc_offset,
- offset);
- vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
- return offset + g_tsc_offset;
+ return 0;
+}
+
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ return vmcs12->tsc_multiplier;
+
+ return kvm_default_tsc_scaling_ratio;
+}
+
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ vmcs_write64(TSC_OFFSET, offset);
+}
+
+static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
+ vmcs_write64(TSC_MULTIPLIER, multiplier);
}
/*
@@ -1797,7 +1817,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1865,6 +1885,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
@@ -1872,10 +1899,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&msr_info->data))
return 1;
/*
- * Enlightened VMCS v1 doesn't have certain fields, but buggy
- * Hyper-V versions are still trying to use corresponding
- * features when they are exposed. Filter out the essential
- * minimum.
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+ * versions are either trying to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
*/
if (!msr_info->host_initiated &&
vmx->nested.enlightened_vmcs_enabled)
@@ -1920,19 +1948,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (!vmx_pt_mode_is_host_guest() ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break;
@@ -1966,6 +1988,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+
return debugctl;
}
@@ -1999,6 +2024,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
@@ -2139,9 +2182,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
@@ -2158,6 +2198,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ /*
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
+ */
+ if (!msr_info->host_initiated &&
+ (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+ return 1;
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
@@ -2215,8 +2278,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!pt_can_write_msr(vmx))
return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges))
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
@@ -2225,14 +2287,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version)
return 1;
@@ -2255,6 +2309,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
@@ -2282,8 +2340,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (is_unrestricted_guest(vcpu) ||
- (enable_ept && is_paging(vcpu)))
+ /*
+ * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
@@ -2293,7 +2354,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
break;
}
}
@@ -2329,7 +2390,7 @@ fault:
return -EFAULT;
}
-static int hardware_enable(void)
+static int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2370,7 +2431,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void hardware_disable(void)
+static void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -2444,7 +2505,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
@@ -2592,7 +2653,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
@@ -2617,7 +2677,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, flags, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2636,7 +2696,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -2673,15 +2733,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2752,7 +2803,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
- vmx_set_segment(vcpu, save, seg);
+ __vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2761,7 +2812,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * Update real mode segment cache. It may be not up-to-date if sement
+ * Update real mode segment cache. It may be not up-to-date if segment
* register was written while vcpu was in a guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -2773,7 +2824,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2871,29 +2922,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
- kvm_mmu_reset_context(vcpu);
}
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
/* Nothing to do if hardware doesn't support EFER. */
- if (!msr)
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
return 0;
vcpu->arch.efer = efer;
- if (efer & EFER_LMA) {
- vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
+ vmx_setup_uret_msrs(vmx);
return 0;
}
@@ -2918,7 +2963,6 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
@@ -2947,10 +2991,17 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 root_hpa = mmu->root_hpa;
+ u64 root_hpa = mmu->root.hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
@@ -2958,32 +3009,30 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ mmu->root_role.level));
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -3013,45 +3062,27 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
-
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -3060,25 +3091,70 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
}
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu))
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+ * run the guest when identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ }
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->emulation_required = vmx_emulation_required(vcpu);
}
static int vmx_get_max_tdp_level(void)
@@ -3088,8 +3164,7 @@ static int vmx_get_max_tdp_level(void)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
@@ -3098,13 +3173,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= (root_hpa & PAGE_MASK);
+ eptp |= root_hpa;
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3112,32 +3187,27 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd, pgd_level);
+ eptp = construct_eptp(vcpu, root_hpa, root_level);
vmcs_write64(EPT_POINTER, eptp);
- if (kvm_x86_ops.tlb_remote_flush) {
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- to_vmx(vcpu)->ept_pointer = eptp;
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_CHECK;
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- }
+ hv_track_root_tdp(vcpu, root_hpa);
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
guest_cr3 = vcpu->arch.cr3;
- else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = pgd;
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
}
if (update_guest_cr3)
vmcs_writel(GUEST_CR3, guest_cr3);
}
+
static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
@@ -3297,7 +3367,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
return ar;
}
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3310,7 +3380,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write16(sf->selector, var->selector);
else if (var->s)
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- goto out;
+ return;
}
vmcs_writel(sf->base, var->base);
@@ -3332,9 +3402,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
-out:
- vmx->emulation_required = emulation_required(vcpu);
+static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3647,7 +3721,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
int ret = 0;
mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page_done)
+ if (kvm->arch.apic_access_memslot_enabled)
goto out;
hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
@@ -3667,7 +3741,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
* is able to migrate it.
*/
put_page(page);
- kvm->arch.apic_access_page_done = true;
+ kvm->arch.apic_access_memslot_enabled = true;
out:
mutex_unlock(&kvm->slots_lock);
return ret;
@@ -3698,48 +3772,20 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
-}
-
-static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+ vmx->nested.force_msr_bitmap_recalc = true;
}
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
- u32 msr, int type)
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3747,8 +3793,7 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3784,8 +3829,7 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
- u32 msr, int type)
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3793,8 +3837,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3818,30 +3861,6 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
- u32 msr, int type, bool value)
-{
- if (value)
- vmx_enable_intercept_for_msr(vcpu, msr, type);
- else
- vmx_disable_intercept_for_msr(vcpu, msr, type);
-}
-
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
- u8 mode = 0;
-
- if (cpu_has_secondary_exec_ctrls() &&
- (secondary_exec_controls_get(to_vmx(vcpu)) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- mode |= MSR_BITMAP_MODE_X2APIC;
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
- mode |= MSR_BITMAP_MODE_X2APIC_APICV;
- }
-
- return mode;
-}
-
static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
{
unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
@@ -3859,11 +3878,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
}
}
-static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u8 mode;
+
if (!cpu_has_vmx_msr_bitmap())
return;
+ if (cpu_has_secondary_exec_ctrls() &&
+ (secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode = MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
+ }
+
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
+
+ vmx->x2apic_msr_bitmap_mode = mode;
+
vmx_reset_x2apic_msrs(vcpu, mode);
/*
@@ -3880,21 +3917,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
}
}
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u8 mode = vmx_msr_bitmap_mode(vcpu);
- u8 changed = mode ^ vmx->msr_bitmap_mode;
-
- if (!changed)
- return;
-
- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(vcpu, mode);
-
- vmx->msr_bitmap_mode = mode;
-}
-
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3905,7 +3927,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
@@ -3951,46 +3973,50 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
}
pt_update_intercept_for_msr(vcpu);
- vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
{
#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
if (vcpu->mode == IN_GUEST_MODE) {
/*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
*
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
*
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
*
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
*
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
*/
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ if (vcpu != kvm_get_running_vcpu())
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return;
}
#endif
- return false;
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4006,9 +4032,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -4039,13 +4077,31 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
- kvm_vcpu_kick(vcpu);
-
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
return 0;
}
+static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
+}
+
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -4097,6 +4153,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@@ -4115,15 +4181,17 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
~vcpu->arch.cr4_guest_rsvd_bits;
- if (!enable_ept)
- vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
vcpu->arch.cr4_guest_owned_bits &=
~get_vmcs12(vcpu)->cr4_guest_host_mask;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
@@ -4139,10 +4207,39 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
return pin_based_exec_ctrl;
}
+static u32 vmx_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmentry_ctrl &
+ ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
+}
+
+static u32 vmx_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
+ }
+
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
if (cpu_has_secondary_exec_ctrls()) {
if (kvm_vcpu_apicv_active(vcpu))
@@ -4155,11 +4252,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
}
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4241,7 +4337,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
struct kvm_vcpu *vcpu = &vmx->vcpu;
@@ -4299,7 +4395,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false);
}
- vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -4311,24 +4423,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
- vmx->secondary_exec_control = exec_control;
-}
-
-static void ept_set_mmio_spte_mask(void)
-{
- /*
- * EPT Misconfigurations can be generated if the value of bits 2:0
- * of an EPT paging-structure entry is 110b (write/execute).
- */
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
+ return exec_control;
}
#define VMX_XSS_EXIT_BITMAP 0
-/*
- * Noting that the initialization of Guest-state Area of VMCS is in
- * vmx_vcpu_reset().
- */
static void init_vmcs(struct vcpu_vmx *vmx)
{
if (nested)
@@ -4337,19 +4436,17 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
- }
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4410,8 +4507,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
- if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -4419,33 +4515,67 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
+}
+
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested)
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct msr_data apic_base_msr;
- u64 cr0;
+
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
vmx->msr_ia32_umwait_control = 0;
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- if (!init_event) {
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
- }
-
vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
@@ -4467,16 +4597,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- if (!init_event) {
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- }
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0xfff0);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4489,31 +4609,13 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_mpx_supported())
vmcs_write64(GUEST_BNDCFGS, 0);
- setup_msrs(vmx);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow() && !init_event) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(vcpu))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vcpu->arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
- }
-
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx->vcpu.arch.cr0 = cr0;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
- vmx_set_cr4(vcpu, 0);
- vmx_set_efer(vcpu, 0);
-
- vmx_update_exception_bitmap(vcpu);
-
vpid_sync_context(vmx->vpid);
- if (init_event)
- vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -4741,7 +4843,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4774,7 +4876,7 @@ static int handle_machine_check(struct kvm_vcpu *vcpu)
* - Guest has #AC detection enabled in CR0
* - Guest EFLAGS has AC bit set
*/
-static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
@@ -4788,7 +4890,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
+ unsigned long cr2, dr6;
u32 vect_info;
vect_info = vmx->idt_vectoring_info;
@@ -4797,6 +4899,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info) || is_nmi(intr_info))
return 1; /* handled by handle_exception_nmi_irqoff() */
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception(vcpu, NM_VECTOR);
+ return 1;
+ }
+
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
@@ -4860,8 +4973,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Note, skipping ICEBP also
+ * clears STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -4878,12 +5016,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- rip = kvm_rip_read(vcpu);
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
- if (guest_inject_ac(vcpu)) {
+ if (vmx_guest_inject_ac(vcpu)) {
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
return 1;
}
@@ -5020,7 +5157,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- val = kvm_register_readl(vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
@@ -5028,6 +5165,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -5053,14 +5191,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- WARN_ONCE(1, "Guest should always own CR0.TS");
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- return kvm_skip_emulated_instruction(vcpu);
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -5100,7 +5237,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ if (vmx_get_cpl(vcpu) > 0)
goto out;
dr7 = vmcs_readl(GUEST_DR7);
@@ -5143,7 +5280,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, reg, val);
err = 0;
} else {
- err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
}
out:
@@ -5161,6 +5298,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
+ * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5184,17 +5327,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(vcpu);
-}
-
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5335,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = kvm_rdpmc(vcpu);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_rcx_read(vcpu);
-
- int err = kvm_set_xcr(vcpu, index, new_bv);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
@@ -5260,9 +5370,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
- u32 offset = exit_qualification & 0xfff;
- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -5356,12 +5473,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
/* ept page table entry is present? */
- error_code |= (exit_qualification &
- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
- EPT_VIOLATION_EXECUTABLE))
+ error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
- error_code |= (exit_qualification & 0x100) != 0 ?
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
vcpu->arch.exit_qualification = exit_qualification;
@@ -5384,6 +5499,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ return 1;
+
/*
* A nested guest cannot optimize MMIO vmexits, because we have an
* nGPA here instead of the required GPA.
@@ -5400,7 +5518,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- WARN_ON_ONCE(!enable_vnmi);
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5408,6 +5528,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
+}
+
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5427,18 +5555,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
/*
@@ -5453,6 +5577,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5485,18 +5619,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-static void vmx_enable_tdp(void)
-{
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
- enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK, 0ull);
-
- ept_set_mmio_spte_mask();
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5638,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
@@ -5553,6 +5652,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
u64 pcid;
u64 gla;
} operand;
+ int gpr_index;
if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5560,12 +5660,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
-
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
@@ -5632,22 +5728,28 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
/*
- * SGX virtualization is not yet supported. There is no software
- * enable bit for SGX, so we have to trap ENCLS and inject a #UD
- * to prevent the guest from executing ENCLS.
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
*/
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
+#endif /* CONFIG_X86_SGX_KVM */
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
- vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
- vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
- return 0;
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
}
/*
@@ -5668,10 +5770,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
- [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_RDPMC] = handle_rdpmc,
- [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
@@ -5685,8 +5787,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_GDTR_IDTR] = handle_desc,
@@ -5694,13 +5796,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
- [EXIT_REASON_RDRAND] = handle_invalid_op,
- [EXIT_REASON_RDSEED] = handle_invalid_op,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
@@ -5712,11 +5814,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ *reason = vmx->exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
@@ -5787,12 +5891,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
{
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
unsigned long cr4;
- u64 efer;
+ int efer_slot;
if (!dump_invalid_vmcs) {
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,11 +5919,12 @@ void dump_vmcs(void)
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- efer = vmcs_read64(GUEST_IA32_EFER);
secondary_exec_control = 0;
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
+ vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
pr_err("*** Guest State ***\n");
pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -5816,9 +5932,7 @@ void dump_vmcs(void)
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
- {
+ if (cpu_has_vmx_ept()) {
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5955,20 @@ void dump_vmcs(void)
vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
- if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
- (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- efer, vmcs_read64(GUEST_IA32_PAT));
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5860,6 +5984,10 @@ void dump_vmcs(void)
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
pr_err("InterruptStatus = %04x\n",
vmcs_read16(GUEST_INTR_STATUS));
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
@@ -5881,14 +6009,16 @@ void dump_vmcs(void)
vmcs_readl(HOST_IA32_SYSENTER_ESP),
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
- if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- vmcs_read64(HOST_IA32_EFER),
- vmcs_read64(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
if (cpu_has_load_perf_global_ctrl() &&
vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
pr_err("*** Control State ***\n");
pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5960,16 +6090,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
-
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
if (is_guest_mode(vcpu)) {
/*
@@ -5992,12 +6119,32 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason.full;
@@ -6006,7 +6153,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
if (unlikely(vmx->fail)) {
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6027,19 +6174,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ int ndata = 3;
+
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.ndata++;
- vcpu->run->internal.data[3] =
+ vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
- vcpu->run->internal.data[vcpu->run->internal.ndata++] =
- vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.ndata = ndata;
return 0;
}
@@ -6092,7 +6239,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
unexpected_vmexit:
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason.full);
- dump_vmcs();
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6107,9 +6254,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
/*
- * Even when current exit reason is handled by KVM internally, we
- * still need to exit to user space when bus lock detected to inform
- * that there is a bus lock in guest.
+ * Exit to user space when bus lock detected to inform that there is
+ * a bus lock in guest.
*/
if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
if (ret > 0)
@@ -6136,7 +6282,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
int size = PAGE_SIZE << L1D_CACHE_ORDER;
/*
- * This code is only executed when the the flush mode is 'cond' or
+ * This code is only executed when the flush mode is 'cond' or
* 'always'
*/
if (static_branch_likely(&vmx_l1d_flush_cond)) {
@@ -6232,6 +6378,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
switch (kvm_get_apic_mode(vcpu)) {
case LAPIC_MODE_INVALID:
WARN_ONCE(true, "Invalid local APIC state");
+ break;
case LAPIC_MODE_DISABLED:
break;
case LAPIC_MODE_XAPIC:
@@ -6257,7 +6404,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
}
secondary_exec_controls_set(vmx, sec_exec_control);
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
@@ -6340,9 +6487,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
+
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
+ return -EIO;
- WARN_ON(!vcpu->arch.apicv_active);
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
@@ -6350,27 +6499,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6395,46 +6550,76 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+ unsigned long entry)
{
- unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
+ bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
- kvm_before_interrupt(vcpu);
- vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
+ vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Do not blindly read xfd_err here, since this exception might
+ * be caused by L1 interception on a platform which doesn't
+ * support xfd at all.
+ *
+ * Do it conditionally upon guest_fpu::xfd. xfd_err matters
+ * only when xfd contains a non-zero value.
+ *
+ * Queuing exception is done in vmx_handle_exit. See comment there.
+ */
+ if (vcpu->arch.guest_fpu.fpstate->xfd)
+ rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
+ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(&vmx->vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
- if (WARN_ONCE(!is_external_intr(intr_info),
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ vcpu->arch.at_instruction_boundary = true;
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->emulation_required)
+ return;
+
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
@@ -6457,6 +6642,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
/* This is AMD only. */
return false;
default:
@@ -6642,31 +6828,18 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
@@ -6676,24 +6849,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
+ vmx_enable_fb_clear(vmx);
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ guest_state_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -6706,10 +6864,22 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required)
+ /*
+ * Don't enter VMX if guest state is invalid, let the exit handler
+ * start emulation until we arrive back to a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->emulation_required)) {
+ vmx->fail = 0;
+
+ vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
+ }
trace_kvm_entry(vcpu);
@@ -6728,7 +6898,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
@@ -6741,6 +6919,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ set_debugreg(vcpu->arch.dr6, 6);
+
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6788,7 +6970,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
@@ -6818,13 +7000,24 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
pt_guest_exit(vmx);
kvm_load_host_xsave_state(vcpu);
- vmx->nested.nested_run_pending = 0;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * Track VMLAUNCH/VMRESUME that have made past guest state
+ * checking.
+ */
+ if (vmx->nested.nested_run_pending &&
+ !vmx->exit_reason.failed_vmentry)
+ ++vcpu->stat.nested_run;
+
+ vmx->nested.nested_run_pending = 0;
+ }
+
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
@@ -6839,7 +7032,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (likely(!vmx->exit_reason.failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
@@ -6855,7 +7048,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return vmx_exit_handlers_fastpath(vcpu);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6866,14 +7059,17 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_loaded_vmcs(vmx->loaded_vmcs);
}
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
+ struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
- int i, cpu, err;
+ int i, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
+ INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@@ -6890,57 +7086,46 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
-
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
- u32 index = vmx_uret_msrs_list[i];
- u32 data_low, data_high;
- int j = vmx->nr_uret_msrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
-
- vmx->guest_uret_msrs[j].slot = i;
- vmx->guest_uret_msrs[j].data = 0;
- switch (index) {
- case MSR_IA32_TSX_CTRL:
- /*
- * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
- * interception. Keep the host value unchanged to avoid
- * changing CPUID bits under the host kernel's feet.
- *
- * hle=0, rtm=0, tsx_ctrl=1 can be found with some
- * combinations of new kernel and old userspace. If
- * those guests run on a tsx=off host, do allow guests
- * to use TSX_CTRL, but do not change the value on the
- * host so that TSX remains always disabled.
- */
- if (boot_cpu_has(X86_FEATURE_RTM))
- vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
- else
- vmx->guest_uret_msrs[j].mask = 0;
- break;
- default:
- vmx->guest_uret_msrs[j].mask = -1ull;
- break;
- }
- ++vmx->nr_uret_msrs;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
@@ -6950,15 +7135,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
- vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
- cpu = get_cpu();
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- init_vmcs(vmx);
- vmx_vcpu_put(vcpu);
- put_cpu();
+
if (cpu_need_virtualize_apic_accesses(vcpu)) {
err = alloc_apic_access_page(vcpu->kvm);
if (err)
@@ -6971,26 +7150,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
- if (nested)
- memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
- else
- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
-
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
-
- vcpu->arch.microcode_version = 0x100000000ULL;
- vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
-
- /*
- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
- * or POSTED_INTR_WAKEUP_VECTOR.
- */
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
-
- vmx->ept_pointer = INVALID_PAGE;
-
return 0;
free_vmcs:
@@ -7007,8 +7166,6 @@ free_vpid:
static int vmx_vm_init(struct kvm *kvm)
{
- spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
-
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7035,7 +7192,6 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
- kvm_apicv_init(kvm, enable_apicv);
return 0;
}
@@ -7065,7 +7221,6 @@ static int __init vmx_check_processor_compat(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
- u64 ipat = 0;
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -7085,33 +7240,25 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* EPT memory type is used to emulate guest CD/MTRR.
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ }
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+ return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
/*
* These bits in the secondary execution controls field
@@ -7125,7 +7272,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
- u32 new_ctl = vmx->secondary_exec_control;
u32 cur_ctl = secondary_exec_controls_get(vmx);
secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
@@ -7209,12 +7355,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
}
/* Get the number of configurable Address Ranges for filtering */
- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
/*
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
@@ -7232,12 +7379,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/*
- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
- * MTCFreq can be set
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+ RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
@@ -7252,12 +7398,12 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
- /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
+ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */
- for (i = 0; i < vmx->pt_desc.addr_range; i++)
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
@@ -7268,17 +7414,18 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
vcpu->arch.xsaves_enabled = false;
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_set_secondary_exec_control(vmx);
- }
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ vmx->msr_ia32_feature_control_valid_bits &=
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
@@ -7300,8 +7447,26 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+
+
set_cr4_guest_host_mask(vmx);
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -7322,6 +7487,13 @@ static __init void vmx_set_cpu_caps(void)
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ }
+
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
@@ -7330,9 +7502,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
- /* CPUID 0x80000001 */
- if (!cpu_has_vmx_rdtscp())
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@ -7388,8 +7562,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
*/
- case x86_intercept_rdtscp:
+ case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
@@ -7464,10 +7639,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
kvm_tsc_scaling_ratio_frac_bits,
- vcpu->arch.tsc_scaling_ratio, &delta_tsc))
+ vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
/*
@@ -7516,25 +7691,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
- if (pi_pre_block(vcpu))
- return 1;
-
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
- if (kvm_x86_ops.set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
-
- pi_post_block(vcpu);
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7553,7 +7709,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7567,7 +7723,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7582,6 +7738,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (ret)
return ret;
+ vmx->nested.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
@@ -7607,38 +7764,45 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void hardware_unsetup(void)
+static void vmx_hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_HYPERV);
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
- return supported & BIT(bit);
+ return supported & BIT(reason);
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .hardware_unsetup = hardware_unsetup,
+ .name = "kvm_intel",
+
+ .hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
@@ -7664,21 +7828,23 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
- .tlb_flush_all = vmx_flush_tlb_all,
- .tlb_flush_current = vmx_flush_tlb_current,
- .tlb_flush_gva = vmx_flush_tlb_gva,
- .tlb_flush_guest = vmx_flush_tlb_guest,
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
- .run = vmx_vcpu_run,
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
.cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
@@ -7698,7 +7864,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
@@ -7711,7 +7877,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
.load_mmu_pgd = vmx_load_mmu_pgd,
@@ -7725,13 +7894,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
- .pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = pi_update_irte,
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
@@ -7741,8 +7907,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
.smi_allowed = vmx_smi_allowed,
- .pre_enter_smm = vmx_pre_enter_smm,
- .pre_leave_smm = vmx_pre_leave_smm,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
.can_emulate_instruction = vmx_can_emulate_instruction,
@@ -7755,17 +7921,83 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
+static __init void vmx_setup_user_return_msrs(void)
+{
+
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
+static void __init vmx_setup_me_spte_mask(void)
+{
+ u64 me_mask = 0;
+
+ /*
+ * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
+ * the former to avoid exposing shadow_phys_bits.
+ *
+ * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
+ * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * boot_cpu_data.x86_phys_bits holds the actual physical address
+ * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
+ * reported by CPUID. Those bits between are KeyID bits.
+ */
+ if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+ kvm_get_shadow_phys_bits() - 1);
+ /*
+ * Unlike SME, host kernel doesn't support setting up any
+ * MKTME KeyID on Intel platforms. No memory encryption
+ * bits should be included into the SPTE.
+ */
+ kvm_mmu_set_me_spte_mask(0, me_mask);
+}
+
+static struct kvm_x86_init_ops vmx_init_ops __initdata;
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i, ept_lpage_level;
+ int r;
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
- kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+ vmx_setup_user_return_msrs();
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
@@ -7792,6 +8024,12 @@ static __init int hardware_setup(void)
!cpu_has_vmx_invept_global())
enable_ept = 0;
+ /* NX support is required for shadow paging. */
+ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
@@ -7832,33 +8070,32 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
- if (cpu_has_vmx_tsc_scaling()) {
+ if (cpu_has_vmx_tsc_scaling())
kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- }
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- vmx_enable_tdp();
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
- if (!enable_ept)
- ept_lpage_level = 0;
- else if (cpu_has_vmx_ept_1g_page())
- ept_lpage_level = PG_LEVEL_1G;
- else if (cpu_has_vmx_ept_2m_page())
- ept_lpage_level = PG_LEVEL_2M;
- else
- ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+ /*
+ * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+ * bits to shadow_zero_check.
+ */
+ vmx_setup_me_spte_mask();
+
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7900,14 +8137,18 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vmx_init_ops.handle_intel_pt_intr = NULL;
+
+ setup_default_sgx_lepubkeyhash();
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
@@ -7921,8 +8162,11 @@ static __init int hardware_setup(void)
vmx_set_cpu_caps();
r = alloc_kvm_area();
- if (r)
+ if (r && nested)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
@@ -7931,8 +8175,10 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.disabled_by_bios = vmx_disabled_by_bios,
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
+ .handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
};
static void vmx_cleanup_l1d_flush(void)
@@ -7978,6 +8224,8 @@ static void vmx_exit(void)
}
#endif
vmx_cleanup_l1d_flush();
+
+ allow_smaller_maxphyaddr = false;
}
module_exit(vmx_exit);
@@ -7995,7 +8243,6 @@ static int __init vmx_init(void)
ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
KVM_EVMCS_VERSION) {
- int cpu;
/* Check that we have assist pages on all online CPUs */
for_each_online_cpu(cpu) {
@@ -8037,6 +8284,8 @@ static int __init vmx_init(void)
return r;
}
+ vmx_setup_fb_clear_ctrl();
+
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 89da5e1251f1..8d2342ede0c5 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -14,8 +14,6 @@
#include "vmx_ops.h"
#include "cpuid.h"
-extern const u32 vmx_msr_index[];
-
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
#define MSR_TYPE_RW 3
@@ -36,7 +34,7 @@ struct vmx_msrs {
};
struct vmx_uret_msr {
- unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
+ bool load_into_hardware;
u64 data;
u64 mask;
};
@@ -64,7 +62,7 @@ struct pt_ctx {
struct pt_desc {
u64 ctl_bitmask;
- u32 addr_range;
+ u32 num_address_ranges;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
struct pt_ctx host;
struct pt_ctx guest;
@@ -144,6 +142,16 @@ struct nested_vmx {
struct vmcs12 *cached_shadow_vmcs12;
/*
+ * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+ */
+ struct gfn_to_hva_cache shadow_vmcs12_cache;
+
+ /*
+ * GPA to HVA cache for VMCS12
+ */
+ struct gfn_to_hva_cache vmcs12_cache;
+
+ /*
* Indicates if the shadow vmcs or enlightened vmcs must be updated
* with the data held by struct vmcs12.
*/
@@ -151,6 +159,15 @@ struct nested_vmx {
bool dirty_vmcs12;
/*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
* Indicates lazily loaded guest state has not yet been decached from
* vmcs02.
*/
@@ -166,6 +183,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -229,7 +247,7 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
u8 fail;
- u8 msr_bitmap_mode;
+ u8 x2apic_msr_bitmap_mode;
/*
* If true, host state has been stored in vmx->loaded_vmcs for
@@ -245,9 +263,13 @@ struct vcpu_vmx {
u32 idt_vectoring_info;
ulong rflags;
+ /*
+ * User return MSRs are always emulated when enabled in the guest, but
+ * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+ * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
+ * be loaded into hardware if those conditions aren't met.
+ */
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
- int nr_uret_msrs;
- int nr_active_uret_msrs;
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
@@ -257,8 +279,6 @@ struct vcpu_vmx {
u64 spec_ctrl;
u32 msr_ia32_umwait_control;
- u32 secondary_exec_control;
-
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -298,6 +318,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
@@ -314,8 +337,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- u64 current_tsc_ratio;
-
unsigned long host_debugctlmsr;
/*
@@ -325,34 +346,28 @@ struct vcpu_vmx {
*/
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
- u64 ept_pointer;
+ /* SGX Launch Control public key hash */
+ u64 msr_ia32_sgxlepubkeyhash[4];
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
};
-enum ept_pointers_status {
- EPT_POINTERS_CHECK = 0,
- EPT_POINTERS_MATCH = 1,
- EPT_POINTERS_MISMATCH = 2
-};
-
struct kvm_vmx {
struct kvm kvm;
unsigned int tss_addr;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
-
- enum ept_pointers_status ept_pointers_match;
- spinlock_t ept_pointer_lock;
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -365,6 +380,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
+bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
@@ -375,12 +391,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
- int root_level);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
@@ -392,10 +407,52 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
- u32 msr, int type, bool value);
+
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+
+static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+ int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(vcpu, msr, type);
+ else
+ vmx_disable_intercept_for_msr(vcpu, msr, type);
+}
+
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+/*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+ * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and
+ * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and
+ * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always
+ * VM-Exit.
+ */
+#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \
+static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int f = sizeof(unsigned long); \
+ \
+ if (msr <= 0x1fff) \
+ return bitop##_bit(msr, bitmap + base / f); \
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \
+ return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \
+ return (rtype)true; \
+}
+#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800)
+
+BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set)
+
static inline u8 vmx_get_rvi(void)
{
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
@@ -409,9 +466,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
vmx->loaded_vmcs->controls_shadow.lname = val; \
} \
} \
+static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
{ \
- return vmx->loaded_vmcs->controls_shadow.lname; \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
} \
static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
{ \
@@ -427,44 +488,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR0)
- | (1 << VCPU_EXREG_CR3)
- | (1 << VCPU_EXREG_CR4)
- | (1 << VCPU_EXREG_EXIT_INFO_1)
- | (1 << VCPU_EXREG_EXIT_INFO_2));
- vcpu->arch.regs_dirty = 0;
-}
-
-static inline u32 vmx_vmentry_ctrl(void)
-{
- u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
- if (vmx_pt_mode_is_system())
- vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmentry_ctrl &
- ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
-}
-
-static inline u32 vmx_vmexit_ctrl(void)
-{
- u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
- if (vmx_pt_mode_is_system())
- vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmexit_ctrl &
- ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
-}
-
-u32 vmx_exec_control(struct vcpu_vmx *vmx);
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
@@ -510,15 +548,9 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
GFP_KERNEL_ACCOUNT);
}
-static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
-{
- vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
- vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-}
-
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
{
- return vmx->secondary_exec_control &
+ return secondary_exec_controls_get(vmx) &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
@@ -543,6 +575,11 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
}
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
+
+static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 28) & 0xf;
+}
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 692b0c31c9c8..5e7f41225780 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -4,13 +4,11 @@
#include <linux/nospec.h>
-#include <asm/kvm_host.h>
#include <asm/vmx.h>
#include "evmcs.h"
#include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#include "x86.h"
asmlinkage void vmread_error(unsigned long field, bool fault);
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
@@ -37,6 +35,10 @@ static __always_inline void vmcs_check32(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
"32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "32-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "32-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"32-bit accessor invalid for natural width field");
}
@@ -69,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -78,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "push $0\n\t"
+ "xorl %k1, %k1\n\t"
+ "2:\n\t"
+ "push %1\n\t"
"push %2\n\t"
- "2:call vmread_error_trampoline\n\t"
+ "call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
@@ -91,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- ".pushsection .fixup, \"ax\"\n\t"
- "4: push $1\n\t"
- "push %2\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 4b)
- : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+
+ : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
}
static __always_inline u16 vmcs_read16(unsigned long field)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..1910e1e78b15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -58,6 +58,7 @@
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
+#include <linux/suspend.h>
#include <trace/events/kvm.h>
@@ -65,8 +66,11 @@
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
+#include <asm/pkru.h>
#include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
@@ -75,6 +79,7 @@
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
@@ -85,6 +90,8 @@
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
#define emul_to_vcpu(ctxt) \
((struct kvm_vcpu *)(ctxt)->vcpu)
@@ -101,6 +108,10 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -111,18 +122,21 @@ static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
+
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
#define KVM_X86_OP(func) \
DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
*(((struct kvm_x86_ops *)0)->func));
-#define KVM_X86_OP_NULL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
@@ -156,9 +170,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
- * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
* advancement entirely. Any other value is used as-is and disables adaptive
- * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
@@ -176,6 +190,14 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_GPL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -183,11 +205,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/
#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_user_return_msrs_global {
- int nr;
- u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
@@ -197,13 +214,15 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
- | XFEATURE_MASK_PKRU)
+ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
@@ -211,60 +230,90 @@ EXPORT_SYMBOL_GPL(host_efer);
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("pf_fixed", pf_fixed),
- VCPU_STAT("pf_guest", pf_guest),
- VCPU_STAT("tlb_flush", tlb_flush),
- VCPU_STAT("invlpg", invlpg),
- VCPU_STAT("exits", exits),
- VCPU_STAT("io_exits", io_exits),
- VCPU_STAT("mmio_exits", mmio_exits),
- VCPU_STAT("signal_exits", signal_exits),
- VCPU_STAT("irq_window", irq_window_exits),
- VCPU_STAT("nmi_window", nmi_window_exits),
- VCPU_STAT("halt_exits", halt_exits),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("hypercalls", hypercalls),
- VCPU_STAT("request_irq", request_irq_exits),
- VCPU_STAT("irq_exits", irq_exits),
- VCPU_STAT("host_state_reload", host_state_reload),
- VCPU_STAT("fpu_reload", fpu_reload),
- VCPU_STAT("insn_emulation", insn_emulation),
- VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
- VCPU_STAT("irq_injections", irq_injections),
- VCPU_STAT("nmi_injections", nmi_injections),
- VCPU_STAT("req_event", req_event),
- VCPU_STAT("l1d_flush", l1d_flush),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
- VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
- VM_STAT("mmu_flooded", mmu_flooded),
- VM_STAT("mmu_recycled", mmu_recycled),
- VM_STAT("mmu_cache_miss", mmu_cache_miss),
- VM_STAT("mmu_unsync", mmu_unsync),
- VM_STAT("remote_tlb_flush", remote_tlb_flush),
- VM_STAT("largepages", lpages, .mode = 0444),
- VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
- VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+ STATS_DESC_COUNTER(VM, mmu_pte_write),
+ STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+ STATS_DESC_COUNTER(VM, mmu_flooded),
+ STATS_DESC_COUNTER(VM, mmu_recycled),
+ STATS_DESC_COUNTER(VM, mmu_cache_miss),
+ STATS_DESC_ICOUNTER(VM, mmu_unsync),
+ STATS_DESC_ICOUNTER(VM, pages_4k),
+ STATS_DESC_ICOUNTER(VM, pages_2m),
+ STATS_DESC_ICOUNTER(VM, pages_1g),
+ STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, pf_taken),
+ STATS_DESC_COUNTER(VCPU, pf_fixed),
+ STATS_DESC_COUNTER(VCPU, pf_emulate),
+ STATS_DESC_COUNTER(VCPU, pf_spurious),
+ STATS_DESC_COUNTER(VCPU, pf_fast),
+ STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
+ STATS_DESC_COUNTER(VCPU, pf_guest),
+ STATS_DESC_COUNTER(VCPU, tlb_flush),
+ STATS_DESC_COUNTER(VCPU, invlpg),
+ STATS_DESC_COUNTER(VCPU, exits),
+ STATS_DESC_COUNTER(VCPU, io_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, irq_window_exits),
+ STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+ STATS_DESC_COUNTER(VCPU, l1d_flush),
+ STATS_DESC_COUNTER(VCPU, halt_exits),
+ STATS_DESC_COUNTER(VCPU, request_irq_exits),
+ STATS_DESC_COUNTER(VCPU, irq_exits),
+ STATS_DESC_COUNTER(VCPU, host_state_reload),
+ STATS_DESC_COUNTER(VCPU, fpu_reload),
+ STATS_DESC_COUNTER(VCPU, insn_emulation),
+ STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+ STATS_DESC_COUNTER(VCPU, hypercalls),
+ STATS_DESC_COUNTER(VCPU, irq_injections),
+ STATS_DESC_COUNTER(VCPU, nmi_injections),
+ STATS_DESC_COUNTER(VCPU, req_event),
+ STATS_DESC_COUNTER(VCPU, nested_run),
+ STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_COUNTER(VCPU, preemption_reported),
+ STATS_DESC_COUNTER(VCPU, preemption_other),
+ STATS_DESC_ICOUNTER(VCPU, guest_mode)
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);
-static struct kmem_cache *x86_fpu_cache;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -326,23 +375,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+ wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_user_return_msr(unsigned slot, u32 msr)
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
{
- BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
- user_return_msrs_global.msrs[slot] = msr;
- if (slot >= user_return_msrs_global.nr)
- user_return_msrs_global.nr = slot + 1;
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
}
-EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
@@ -351,8 +430,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value;
int i;
- for (i = 0; i < user_return_msrs_global.nr; ++i) {
- rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value;
msrs->values[i].curr = value;
}
@@ -367,7 +446,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err)
return 1;
@@ -424,7 +503,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running. Usually after catching the
+ * fault we just panic; during reboot instead the instruction is ignored.
+ */
+noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
@@ -543,8 +629,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
@@ -645,6 +729,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_COMPLETE_USER_EXIT);
+}
+
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
@@ -660,6 +755,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
+/* Returns true if the page fault was immediately morphed into a VM-Exit. */
bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -676,10 +772,28 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root_hpa);
+ fault_mmu->root.hpa);
+
+ /*
+ * A workaround for KVM's bad exception handling. If KVM injected an
+ * exception into L2, and L2 encountered a #PF while vectoring the
+ * injected exception, manually check to see if L1 wants to intercept
+ * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
+ * In all other cases, defer the check to nested_ops->check_events(),
+ * which will correctly handle priority (this does not). Note, other
+ * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
+ * most problematic, e.g. when L0 and L1 are both intercepting #PF for
+ * shadow paging.
+ *
+ * TODO: Rewrite exception handling to track injected and pending
+ * (VM-Exit) exceptions separately.
+ */
+ if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
+ kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
+ return true;
fault_mmu->inject_page_fault(vcpu, fault);
- return fault->nested_page_fault;
+ return false;
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
@@ -725,37 +839,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_GPL(kvm_require_dr);
-/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_vcpu_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- struct x86_exception exception;
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -764,73 +847,68 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
/*
* Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ gpa_t real_gpa;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (ret < 0) {
- ret = 0;
- goto out;
- }
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
- ret = 0;
- goto out;
+ return 0;
}
}
- ret = 1;
+
+ /*
+ * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+ kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+ vcpu->arch.pdptrs_from_userspace = false;
-out:
-
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- int offset;
- gfn_t gfn;
- int r;
-
- if (!is_pae_paging(vcpu))
- return false;
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- return true;
-
- gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
- if (r < 0)
- return true;
-
- return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
-
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
+
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
- if ((cr0 ^ old_cr0) & update_bits)
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@ -843,7 +921,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
cr0 |= X86_CR0_ET;
@@ -873,11 +950,12 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
- is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -909,11 +987,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
- vcpu->arch.pkru != vcpu->arch.host_pkru)
- __write_pkru(vcpu->arch.pkru);
+ vcpu->arch.pkru != vcpu->arch.host_pkru &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
+ write_pkru(vcpu->arch.pkru);
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -922,13 +1002,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
- __write_pkru(vcpu->arch.host_pkru);
+ write_pkru(vcpu->arch.host_pkru);
}
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@ -943,6 +1025,18 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
+static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
+}
+
+#ifdef CONFIG_X86_64
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
+{
+ return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
@@ -960,9 +1054,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/*
* Do not allow the guest to set bits that we do not support
* saving. However, xcr0 bit 0 is always set, even if the
- * emulated CPU does not support XSAVE (see fx_init).
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
- valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
@@ -976,6 +1070,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
+
+ if ((xcr0 & XFEATURE_MASK_XTILE) &&
+ ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+ return 1;
+
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -983,14 +1082,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
return 0;
}
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
- return __kvm_set_xcr(vcpu, index, xcr);
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1006,20 +1108,49 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
- if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ /*
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
+ /*
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
+ *
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
+ */
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
- kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP;
if (!kvm_is_valid_cr4(vcpu, cr4))
return 1;
@@ -1030,9 +1161,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if ((cr4 ^ old_cr4) & X86_CR4_LA57)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
@@ -1052,35 +1182,95 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ /*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
+ * If neither the current CR3 nor any of the prev_roots use the given
+ * PCID, then nothing needs to be done here because a resync will
+ * happen anyway before switching to any other CR3.
+ */
+ if (kvm_get_active_pcid(vcpu) == pcid) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
+}
+
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
#ifdef CONFIG_X86_64
bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
if (pcid_enabled) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
}
#endif
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
- if (!skip_tlb_flush) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
- return 0;
- }
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
- if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ /*
+ * Do not condition the GPA check on long mode, this helper is used to
+ * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+ * the current vCPU mode is accurate.
+ */
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
- else if (is_pae_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
return 1;
- kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
+
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
return 0;
}
@@ -1114,7 +1304,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
}
}
@@ -1139,6 +1328,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1191,20 +1383,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
- int err;
- err = kvm_pmu_rdpmc(vcpu, ecx, &data);
- if (err)
- return err;
+ if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
kvm_rax_write(vcpu, (u32)data);
kvm_rdx_write(vcpu, data >> 32);
- return err;
+ return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1236,7 +1429,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1257,6 +1450,14 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1287,7 +1488,7 @@ static const u32 emulated_msrs_all[] = {
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSCDEADLINE,
+ MSR_IA32_TSC_DEADLINE,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
@@ -1299,6 +1500,7 @@ static const u32 emulated_msrs_all[] = {
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
MSR_IA32_POWER_CTL,
MSR_IA32_UCODE_REV,
@@ -1372,7 +1574,7 @@ static u64 kvm_get_arch_capabilities(void)
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
* the nested hypervisor runs with NX huge pages. If it is not,
- * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
* L1 guests, so it need not worry about its own (L2) guests.
*/
data |= ARCH_CAP_PSCHANGE_MC_NO;
@@ -1415,6 +1617,9 @@ static u64 kvm_get_arch_capabilities(void)
*/
}
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
return data;
}
@@ -1510,8 +1715,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -1577,9 +1781,6 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
{
struct msr_data msr;
- if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return KVM_MSR_RET_FILTERED;
-
switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
@@ -1603,7 +1804,31 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if and only if the guest CPU is Intel, and clear
+ * the bits in all other cases. This ensures cross-vendor
+ * migration will provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
}
msr.data = data;
@@ -1637,8 +1862,17 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
struct msr_data msr;
int ret;
- if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return KVM_MSR_RET_FILTERED;
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ }
msr.index = index;
msr.host_initiated = host_initiated;
@@ -1664,6 +1898,20 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
return ret;
}
+static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+
+static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
+}
+
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
return kvm_get_msr_ignored_check(vcpu, index, data, false);
@@ -1676,22 +1924,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
{
- int err = vcpu->run->msr.error;
- if (!err) {
+ if (!vcpu->run->msr.error) {
kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
+}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
}
-static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_emulated_msr_access(vcpu);
+}
+
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_fast_msr_access(vcpu);
+}
+
static u64 kvm_msr_reason(int r)
{
switch (r) {
@@ -1726,31 +1988,13 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
return 1;
}
-static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r);
-}
-
-static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_wrmsr, r);
-}
-
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
int r;
- r = kvm_get_msr(vcpu, ecx, &data);
-
- /* MSR read failed? See if we should ask user space */
- if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
- /* Bounce to user space */
- return 0;
- }
+ r = kvm_get_msr_with_filter(vcpu, ecx, &data);
if (!r) {
trace_kvm_msr_read(ecx, data);
@@ -1758,6 +2002,10 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
kvm_rax_write(vcpu, data & -1u);
kvm_rdx_write(vcpu, (data >> 32) & -1u);
} else {
+ /* MSR read failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+ complete_fast_rdmsr, r))
+ return 0;
trace_kvm_msr_read_ex(ecx);
}
@@ -1771,26 +2019,59 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
u64 data = kvm_read_edx_eax(vcpu);
int r;
- r = kvm_set_msr(vcpu, ecx, data);
-
- /* MSR write failed? See if we should ask user space */
- if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
- /* Bounce to user space */
- return 0;
-
- /* Signal all other negative errors to userspace */
- if (r < 0)
- return r;
+ r = kvm_set_msr_with_filter(vcpu, ecx, data);
- if (!r)
+ if (!r) {
trace_kvm_msr_write(ecx, data);
- else
+ } else {
+ /* MSR write failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+ complete_fast_msr_access, r))
+ return 0;
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
trace_kvm_msr_write_ex(ecx, data);
+ }
return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
xfer_to_guest_mode_prepare();
@@ -1811,17 +2092,10 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
return 1;
if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
- ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
- ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
- ((u32)(data >> 32) != X2APIC_BROADCAST)) {
-
- data &= ~(1 << 12);
- kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
- kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
- trace_kvm_apic_write(APIC_ICR, (u32)data);
- return 0;
- }
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST))
+ return kvm_x2apic_icr_write(vcpu->arch.apic, data);
return 1;
}
@@ -1849,7 +2123,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
ret = EXIT_FASTPATH_EXIT_HANDLED;
}
break;
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
kvm_skip_emulated_instruction(vcpu);
@@ -1946,7 +2220,7 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
@@ -2008,14 +2282,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
- vcpu->arch.pv_time_enabled = false;
- if (!(system_time & 1))
- return;
-
- if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.pv_time, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = true;
+ if (system_time & 1) {
+ kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+ KVM_HOST_USES_PFN, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
+ } else {
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ }
return;
}
@@ -2068,13 +2341,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
u64 ratio;
/* Guest TSC same frequency as host TSC? */
if (!scale) {
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return 0;
}
@@ -2100,7 +2375,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return -1;
}
- vcpu->arch.tsc_scaling_ratio = ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
return 0;
}
@@ -2112,7 +2387,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
/* tsc_khz can be zero if TSC calibration fails */
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return -1;
}
@@ -2146,10 +2421,12 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
+#ifdef CONFIG_X86_64
static inline int gtod_is_based_on_tsc(int mode)
{
return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
+#endif
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
@@ -2194,10 +2471,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
- u64 ratio = vcpu->arch.tsc_scaling_ratio;
if (ratio != kvm_default_tsc_scaling_ratio)
_tsc = __scale_tsc(ratio, tsc);
@@ -2206,25 +2482,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
}
EXPORT_SYMBOL_GPL(kvm_scale_tsc);
-static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc());
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ return vcpu->arch.l1_tsc_offset +
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
+{
+ u64 nested_offset;
+
+ if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+ nested_offset = l1_offset;
+ else
+ nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ nested_offset += l2_offset;
+ return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
+{
+ if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+ return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vcpu->arch.l1_tsc_offset,
+ l1_offset);
+
+ vcpu->arch.l1_tsc_offset = l1_offset;
+
+ /*
+ * If we are here because L1 chose not to trap WRMSR to TSC then
+ * according to the spec this should set L1's TSC (as opposed to
+ * setting L1's offset for L2).
+ */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ l1_offset,
+ static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_offset = l1_offset;
+
+ static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
{
- vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+ vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+ /* Userspace is changing the multiplier while L2 is active */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ l1_multiplier,
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+ if (kvm_has_tsc_control)
+ static_call(kvm_x86_write_tsc_multiplier)(
+ vcpu, vcpu->arch.tsc_scaling_ratio);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2240,17 +2577,68 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
}
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu);
+}
+
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- bool matched;
- bool already_matched;
+ bool matched = false;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -2289,54 +2677,13 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
- already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
- } else {
- /*
- * We split periods of matched TSC writes into generations.
- * For each generation, we track the original measured
- * nanosecond time, offset, and write, so if TSCs are in
- * sync, we can match exact offset, and if not, we can match
- * exact software computation in compute_guest_tsc()
- *
- * These values are tracked in kvm->arch.cur_xxx variables.
- */
- kvm->arch.cur_tsc_generation++;
- kvm->arch.cur_tsc_nsec = ns;
- kvm->arch.cur_tsc_write = data;
- kvm->arch.cur_tsc_offset = offset;
- matched = false;
}
- /*
- * We also track th most recent recorded KHZ, write and time to
- * allow the matching interval to be extended at each write.
- */
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-
- vcpu->arch.last_guest_tsc = data;
-
- /* Keep track of which generation this VCPU has synchronized to */
- vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
- vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
- vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
-
- kvm_vcpu_write_tsc_offset(vcpu, offset);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
- if (!matched) {
- kvm->arch.nr_vcpus_matched_tsc = 0;
- } else if (!already_matched) {
- kvm->arch.nr_vcpus_matched_tsc++;
- }
-
- kvm_track_tsc_matching(vcpu);
- spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2348,9 +2695,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ adjustment = kvm_scale_tsc((u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -2523,6 +2871,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));
@@ -2547,132 +2896,158 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
-void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
+static void __kvm_start_pvclock_update(struct kvm *kvm)
{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
- unsigned long flags;
-
- kvm_hv_invalidate_tsc_page(kvm);
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
+}
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-#endif
}
-u64 get_kvmclock_ns(struct kvm *kvm)
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- if (!ka->use_master_clock) {
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
- return get_kvmclock_base_ns() + ka->kvmclock_offset;
- }
-
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
- if (__this_cpu_read(cpu_tsc_khz)) {
+ data->flags = 0;
+ if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- ret = __pvclock_read_cycles(&hv_clock, rdtsc());
- } else
- ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ }
put_cpu();
+}
- return ret;
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
- struct gfn_to_hva_cache *cache,
- unsigned int offset)
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
+}
+
+static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
- struct pvclock_vcpu_time_info guest_hv_clock;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
+ unsigned long flags;
- if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
- &guest_hv_clock, offset, sizeof(guest_hv_clock))))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ guest_hv_clock = (void *)(gpc->khva + offset);
- /* This VCPU is paused, but it's legal for a guest to read another
+ /*
+ * This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even after
* it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
*/
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- if (guest_hv_clock.version & 1)
- ++guest_hv_clock.version; /* first time write, random junk */
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+ vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+ memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+ smp_wmb();
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock));
+ guest_hv_clock->version = ++vcpu->hv_clock.version;
- smp_wmb();
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
- vcpu->hv_clock.version++;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -2687,13 +3062,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- use_master_clock = ka->use_master_clock;
- if (use_master_clock) {
- host_tsc = ka->master_cycle_now;
- kernel_ns = ka->master_kernel_ns;
- }
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -2733,7 +3109,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
+ v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -2753,15 +3130,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
- if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
- if (vcpu->xen.vcpu_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
- if (vcpu->xen.vcpu_time_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
- if (v == kvm_get_vcpu(v->kvm, 0))
- kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ if (vcpu->pv_time.active)
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -2783,7 +3159,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
static void kvmclock_update_fn(struct work_struct *work)
{
- int i;
+ unsigned long i;
struct delayed_work *dwork = to_delayed_work(work);
struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
kvmclock_update_work);
@@ -2948,26 +3324,64 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- vcpu->arch.pv_time_enabled = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_all)(vcpu);
+ static_call(kvm_x86_flush_tlb_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_guest)(vcpu);
+
+ if (!tdp_enabled) {
+ /*
+ * A TLB flush on behalf of the guest is equivalent to
+ * INVPCID(all), toggling CR4.PGE, etc., which requires
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
+ */
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
+ }
+
+ static_call(kvm_x86_flush_tlb_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ u64 steal;
+ u32 version;
if (kvm_xen_msr_enabled(vcpu->kvm)) {
kvm_xen_runstate_set_running(vcpu);
@@ -2977,43 +3391,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- /* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+ st = (struct kvm_steal_time __user *)ghc->hva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
+
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- st->preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
- }
- vcpu->arch.st.preempted = 0;
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
+ } else {
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
- if (st->version & 1)
- st->version += 1; /* first time write, random junk */
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
smp_wmb();
- st->steal += current->sched_info.run_delay -
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
- smp_wmb();
-
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3049,7 +3506,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ if (kvm_get_msr_feature(&msr_ent))
return 1;
if (data & ~msr_ent.data)
return 1;
@@ -3087,7 +3544,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
@@ -3095,6 +3552,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -3122,7 +3583,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, data);
} else {
- u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
vcpu->arch.ia32_tsc_adjust_msr += adj;
}
@@ -3139,6 +3600,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -3186,7 +3648,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
break;
case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
vcpu->arch.apf.pageready_pending = false;
@@ -3215,7 +3677,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
return 1;
- if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
+ if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
break;
@@ -3301,6 +3763,28 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3357,7 +3841,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
- case MSR_K8_SYSCFG:
+ case MSR_AMD64_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
case MSR_VM_HSAVE_PA:
@@ -3382,6 +3866,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = 0;
+ break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3418,10 +3908,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* return L1's TSC value to ensure backwards-compatible
* behavior for migration.
*/
- u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
- vcpu->arch.tsc_offset;
+ u64 offset, ratio;
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -3449,7 +3946,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
@@ -3512,7 +4009,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
msr_info->data = 0;
@@ -3608,6 +4105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_HWCR:
msr_info->data = vcpu->arch.msr_hwcr;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -3745,6 +4258,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
@@ -3771,13 +4285,31 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X86_USER_SPACE_MSR:
case KVM_CAP_X86_MSR_FILTER:
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ case KVM_CAP_SREGS2:
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
+ case KVM_CAP_ENABLE_CAP:
r = 1;
break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ r = KVM_EXIT_HYPERCALL_VALID_MASK;
+ break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ KVM_XEN_HVM_CONFIG_SHARED_INFO |
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break;
@@ -3786,7 +4318,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
- r = KVM_CLOCK_TSC_STABLE;
+ r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
@@ -3805,17 +4337,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
- case KVM_CAP_VAPIC:
- r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
- break;
case KVM_CAP_NR_VCPUS:
- r = KVM_SOFT_MAX_VCPUS;
+ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_MAX_VCPU_ID;
+ r = KVM_MAX_VCPU_IDS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -3827,6 +4356,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
+ case KVM_CAP_VM_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
case KVM_CAP_X2APIC_API:
@@ -3855,11 +4385,67 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
else
r = 0;
break;
+ case KVM_CAP_XSAVE2: {
+ u64 guest_perm = xstate_get_guest_group_perm();
+
+ r = xstate_required_size(supported_xcr0 & guest_perm, false);
+ if (r < sizeof(struct kvm_xsave))
+ r = sizeof(struct kvm_xsave);
+ break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
+ break;
+ }
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = KVM_X86_VALID_QUIRKS;
+ break;
default:
break;
}
return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user*)(unsigned long)attr->addr;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return ERR_PTR_USR(-EFAULT);
+ return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+ if (attr->group)
+ return -ENXIO;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ if (put_user(supported_xcr0, uaddr))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENXIO;
+ break;
+ }
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+ if (attr->group)
+ return -ENXIO;
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ return 0;
+ default:
+ return -ENXIO;
+ }
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -3950,6 +4536,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
break;
+ case KVM_GET_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_get_attr(&attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_has_attr(&attr);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -3998,7 +4600,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
mark_tsc_unstable("KVM discovered backwards TSC");
if (kvm_check_tsc_unstable()) {
- u64 offset = kvm_compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_l1_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -4023,62 +4625,77 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
- int idx;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
+
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+ vcpu->stat.preemption_reported++;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
if (vcpu->arch.st.preempted)
return;
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
- goto out;
+ slots = kvm_memslots(vcpu->kvm);
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
-out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ int idx;
- if (kvm_xen_msr_enabled(vcpu->kvm))
- kvm_xen_runstate_set_preempted(vcpu);
- else
- kvm_steal_time_set_preempted(vcpu);
+ if (vcpu->preempted) {
+ if (!vcpu->arch.guest_state_protected)
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
- /*
- * If userspace has set any breakpoints or watchpoints, dr6 is restored
- * on every vmexit, but if not, we might have a stale dr6 from the
- * guest. do_debug expects dr6 to be cleared after it runs, do the same.
- */
- set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4114,8 +4731,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
- return kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_accept_dm_intr(vcpu);
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !vcpu->arch.exception.pending);
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -4319,7 +4945,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
@@ -4380,11 +5006,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- if (events->smi.smm)
- vcpu->arch.hflags |= HF_SMM_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- kvm_smm_changed(vcpu);
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+ kvm_smm_changed(vcpu, events->smi.smm);
}
vcpu->arch.smi_pending = events->smi.pending;
@@ -4442,141 +5065,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return 0;
}
-#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
-
-static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = xsave->header.xfeatures;
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(dest, xsave, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV */
- xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
- *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
-
- /*
- * Copy each region from the possibly compacted offset to the
- * non-compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
- void *src = get_xsave_addr(xsave, xfeature_nr);
-
- if (src) {
- u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
- if (xfeature_nr == XFEATURE_PKRU)
- memcpy(dest + offset, &vcpu->arch.pkru,
- sizeof(vcpu->arch.pkru));
- else
- memcpy(dest + offset, src, size);
-
- }
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return;
- valid -= xfeature_mask;
- }
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ sizeof(guest_xsave->region),
+ vcpu->arch.pkru);
}
-static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(xsave, src, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV and possibly XCOMP_BV. */
- xsave->header.xfeatures = xstate_bv;
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Copy each region from the non-compacted offset to the
- * possibly compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
- void *dest = get_xsave_addr(xsave, xfeature_nr);
-
- if (dest) {
- u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
- if (xfeature_nr == XFEATURE_PKRU)
- memcpy(&vcpu->arch.pkru, src + offset,
- sizeof(vcpu->arch.pkru));
- else
- memcpy(dest, src + offset, size);
- }
-
- valid -= xfeature_mask;
- }
-}
-
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
-{
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return;
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- memset(guest_xsave, 0, sizeof(struct kvm_xsave));
- fill_xsave((u8 *) guest_xsave->region, vcpu);
- } else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu->state.fxsave,
- sizeof(struct fxregs_state));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XFEATURE_MASK_FPSSE;
- }
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ state, size, vcpu->arch.pkru);
}
-#define XSAVE_MXCSR_OFFSET 24
-
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- u64 xstate_bv;
- u32 mxcsr;
-
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
- xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
-
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- /*
- * Here we allow setting states that are not present in
- * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
- * with old userspace.
- */
- if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- load_xsave(vcpu, (u8 *)guest_xsave->region);
- } else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
- mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu->state.fxsave,
- guest_xsave->region, sizeof(struct fxregs_state));
- }
- return 0;
+ return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ supported_xcr0, &vcpu->arch.pkru);
}
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -4624,13 +5143,122 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.pv_time_enabled)
+ if (!vcpu->arch.pv_time.active)
return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+ int r;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_device_attr attr;
+ int r;
+
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
+
+ return r;
+}
+
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
@@ -4669,13 +5297,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
if (vcpu->arch.pv_cpuid.enforce)
kvm_update_pv_runtime(vcpu);
return 0;
-
default:
return -EINVAL;
}
@@ -4688,6 +5318,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int r;
union {
+ struct kvm_sregs2 *sregs2;
struct kvm_lapic_state *lapic;
struct kvm_xsave *xsave;
struct kvm_xcrs *xcrs;
@@ -4890,6 +5521,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
+ r = -EINVAL;
+ if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+ break;
+
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
@@ -4904,7 +5539,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_XSAVE: {
- u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = memdup_user(argp, size);
if (IS_ERR(u.xsave)) {
r = PTR_ERR(u.xsave);
goto out_nofree;
@@ -4913,6 +5550,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
}
+
+ case KVM_GET_XSAVE2: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, size))
+ break;
+
+ r = 0;
+ break;
+ }
+
case KVM_GET_XCRS: {
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
@@ -5060,6 +5716,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
#endif
+ case KVM_GET_SREGS2: {
+ u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.sregs2)
+ goto out;
+ __get_sregs2(vcpu, u.sregs2);
+ r = -EFAULT;
+ if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS2: {
+ u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+ if (IS_ERR(u.sregs2)) {
+ r = PTR_ERR(u.sregs2);
+ u.sregs2 = NULL;
+ goto out;
+ }
+ r = __set_sregs2(vcpu, u.sregs2);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
default:
r = -EINVAL;
}
@@ -5250,7 +5933,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
* VM-Exit.
*/
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
@@ -5277,6 +5960,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ break;
+ fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
@@ -5298,6 +5986,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -5357,6 +6046,64 @@ split_irqchip_unlock:
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
+
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
+
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+ r = -EINVAL;
+ break;
+ }
+ kvm->arch.hypercall_exit_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ r = -EINVAL;
+ if (cap->args[0] & ~1)
+ break;
+ kvm->arch.exit_on_emulation_error = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = -EINVAL;
+ if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
default:
r = -EINVAL;
break;
@@ -5392,14 +6139,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
- int r;
if (!user_range->nmsrs)
return 0;
+ if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL;
@@ -5408,31 +6159,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- range = (struct msr_bitmap_range) {
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags,
.base = user_range->base,
.nmsrs = user_range->nmsrs,
.bitmap = bitmap,
};
- if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
- r = -EINVAL;
- goto err;
- }
-
- if (!range.flags) {
- r = -EINVAL;
- goto err;
- }
-
- /* Everything ok, add this range identifier. */
- msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++;
-
return 0;
-err:
- kfree(bitmap);
- return r;
}
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@ -5483,6 +6218,99 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
return 0;
}
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret = 0;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!vcpu->arch.pv_time.active)
+ continue;
+
+ ret = kvm_set_guest_paused(vcpu);
+ if (ret) {
+ kvm_err("Failed to pause guest VCPU%d: %d\n",
+ vcpu->vcpu_id, ret);
+ break;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+
+ return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_arch_suspend_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
+
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
+
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
+ }
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5555,6 +6383,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -5725,65 +6554,50 @@ set_pit2_out:
r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
-#endif
- case KVM_SET_CLOCK: {
- struct kvm_arch *ka = &kvm->arch;
- struct kvm_clock_data user_ns;
- u64 now_ns;
+ case KVM_XEN_HVM_EVTCHN_SEND: {
+ struct kvm_irq_routing_xen_evtchn uxe;
r = -EFAULT;
- if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ if (copy_from_user(&uxe, argp, sizeof(uxe)))
goto out;
+ r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+ break;
+ }
+#endif
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
+ break;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
+ break;
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
r = -EINVAL;
- if (user_ns.flags)
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
- r = 0;
- /*
- * TODO: userspace has to take care of races with VCPU_RUN, so
- * kvm_gen_update_masterclock() can be cut down to locked
- * pvclock_update_vm_gtod_copy().
- */
- kvm_gen_update_masterclock(kvm);
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
- /*
- * This pairs with kvm_guest_time_update(): when masterclock is
- * in use, we use master_kernel_ns + kvmclock_offset to set
- * unsigned 'system_time' so if we use get_kvmclock_ns() (which
- * is slightly ahead) here we risk going negative on unsigned
- * 'system_time' when 'user_ns.clock' is very small.
- */
- spin_lock_irq(&ka->pvclock_gtod_sync_lock);
- if (kvm->arch.use_master_clock)
- now_ns = ka->master_kernel_ns;
- else
- now_ns = get_kvmclock_base_ns();
- ka->kvmclock_offset = user_ns.clock - now_ns;
- spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
- kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
- break;
+ goto out;
}
- case KVM_GET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- now_ns = get_kvmclock_ns(kvm);
- user_ns.clock = now_ns;
- user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
- r = -EFAULT;
- if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
- goto out;
- r = 0;
- break;
+ case KVM_GET_TSC_KHZ: {
+ r = READ_ONCE(kvm->arch.default_tsc_khz);
+ goto out;
}
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_op)
- r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5794,8 +6608,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_reg_region)
- r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5806,8 +6622,10 @@ set_pit2_out:
goto out;
r = -ENOTTY;
- if (kvm_x86_ops.mem_enc_unreg_region)
- r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -5838,7 +6656,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i;
- BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+ BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
"Please update the fixed PMCs in msrs_to_saved_all[]");
perf_get_x86_pmu_capability(&x86_pmu);
@@ -5861,7 +6679,8 @@ static void kvm_init_msr_list(void)
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue;
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -5901,6 +6720,11 @@ static void kvm_init_msr_list(void)
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
continue;
break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ continue;
+ break;
default:
break;
}
@@ -5981,16 +6805,17 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
static_call(kvm_x86_get_segment)(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
gpa_t t_gpa;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
+ t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
return t_gpa;
}
@@ -5998,43 +6823,53 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 access,
+ struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6062,13 +6897,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
/* Inline kvm_read_guest_virt_helper for speed. */
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+ exception);
if (unlikely(gpa == UNMAPPED_GVA))
return X86EMUL_PROPAGATE_FAULT;
@@ -6087,7 +6923,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -6106,9 +6942,11 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception, bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = 0;
+ u64 access = 0;
- if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6124,16 +6962,15 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
}
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 access,
+ struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6159,9 +6996,11 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = PFERR_WRITE_MASK;
+ u64 access = PFERR_WRITE_MASK;
- if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6179,6 +7018,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
+}
+
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@@ -6186,7 +7032,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -6220,7 +7066,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -6228,16 +7075,16 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
* there is no pkey in EPT page table for L1 guest or EPT
* shadow page table for L2 guest.
*/
- if (vcpu_match_mmio_gva(vcpu, gva)
- && !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.mmio_access, 0, access)) {
+ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+ !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.mmio_access, 0, access))) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
- *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return -1;
@@ -6454,15 +7301,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
exception, &write_emultor);
}
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+ (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
@@ -6471,12 +7311,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int bytes,
struct x86_exception *exception)
{
- struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u64 page_line_mask;
+ unsigned long hva;
gpa_t gpa;
- char *kaddr;
- bool exchanged;
+ int r;
/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
@@ -6500,31 +7339,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
+ hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
goto emul_write;
- kaddr = map.hva + offset_in_page(gpa);
+ hva += offset_in_page(gpa);
switch (bytes) {
case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u8, hva, old, new);
break;
case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u16, hva, old, new);
break;
case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u32, hva, old, new);
break;
case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u64, hva, old, new);
break;
default:
BUG();
}
- kvm_vcpu_unmap(vcpu, &map, true);
-
- if (!exchanged)
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+ if (r)
return X86EMUL_CMPXCHG_FAILED;
kvm_page_track_write(vcpu, gpa, new, bytes);
@@ -6557,7 +7397,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
}
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val,
+ unsigned short port,
unsigned int count, bool in)
{
vcpu->arch.pio.port = port;
@@ -6565,10 +7405,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data))
return 1;
- }
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -6580,26 +7418,44 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
return 0;
}
-static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val, unsigned int count)
+static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, unsigned int count)
{
- int ret;
+ WARN_ON(vcpu->arch.pio.count);
+ memset(vcpu->arch.pio_data, 0, size * count);
+ return emulator_pio_in_out(vcpu, size, port, count, true);
+}
- if (vcpu->arch.pio.count)
- goto data_avail;
+static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
+{
+ int size = vcpu->arch.pio.size;
+ unsigned count = vcpu->arch.pio.count;
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+}
- memset(vcpu->arch.pio_data, 0, size * count);
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
+{
+ if (vcpu->arch.pio.count) {
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
+ } else {
+ int r = __emulator_pio_in(vcpu, size, port, count);
+ if (!r)
+ return r;
- ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
- if (ret) {
-data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
- vcpu->arch.pio.count = 0;
- return 1;
+ /* Results already available, fall through. */
}
- return 0;
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
}
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
@@ -6614,9 +7470,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, const void *val,
unsigned int count)
{
+ int ret;
+
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
- return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+ ret = emulator_pio_in_out(vcpu, size, port, count, false);
+ if (ret)
+ vcpu->arch.pio.count = 0;
+
+ return ret;
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
@@ -6841,15 +7703,16 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
return;
}
-static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 *pdata)
+static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_get_msr(vcpu, msr_index, pdata);
+ r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
- if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -6857,15 +7720,16 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return r;
}
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
+static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_set_msr(vcpu, msr_index, data);
+ r = kvm_set_msr_with_filter(vcpu, msr_index, data);
- if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -6873,6 +7737,18 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return r;
}
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+}
+
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+}
+
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
@@ -6890,7 +7766,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
+ if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
+ return 0;
+ return -EINVAL;
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -6934,14 +7812,19 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
- return kvm_register_read(emul_to_vcpu(ctxt), reg);
+ return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
}
static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
{
- kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+ kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
}
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
@@ -6954,20 +7837,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
return emul_to_vcpu(ctxt)->arch.hflags;
}
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
{
- emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ kvm_smm_changed(vcpu, false);
}
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
- kvm_smm_changed(emul_to_vcpu(ctxt));
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7002,6 +7887,8 @@ static const struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.get_smbase = emulator_get_smbase,
.set_smbase = emulator_set_smbase,
+ .set_msr_with_filter = emulator_set_msr_with_filter,
+ .get_msr_with_filter = emulator_get_msr_with_filter,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
@@ -7014,11 +7901,12 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_long_mode = emulator_guest_has_long_mode,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
- .set_hflags = emulator_set_hflags,
- .pre_leave_smm = emulator_pre_leave_smm,
- .post_leave_smm = emulator_post_leave_smm,
+ .exiting_smm = emulator_exiting_smm,
+ .leave_smm = emulator_leave_smm,
+ .triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
};
@@ -7093,6 +7981,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
@@ -7119,8 +8012,82 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
+{
+ struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
+
+ /*
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
+ */
+ memset(&info, 0, sizeof(info));
+
+ static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
+ &info[2], (u32 *)&info[3],
+ (u32 *)&info[4]);
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
+
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
+ run->emulation_failure.flags = 0;
+
+ if (insn_size) {
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
+ run->emulation_failure.flags |=
+ KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+ run->emulation_failure.insn_size = insn_size;
+ memset(run->emulation_failure.insn_bytes, 0x90,
+ sizeof(run->emulation_failure.insn_bytes));
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
+ }
+
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
+
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
+}
+
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
+ struct kvm *kvm = vcpu->kvm;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
@@ -7129,19 +8096,16 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
- if (emulation_type & EMULTYPE_SKIP) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (kvm->arch.exit_on_emulation_error ||
+ (emulation_type & EMULTYPE_SKIP)) {
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -7162,7 +8126,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
- if (!vcpu->arch.mmu->direct_map) {
+ if (!vcpu->arch.mmu->root_role.direct) {
/*
* Write permission should be allowed since only
* write access need to be emulated.
@@ -7195,7 +8159,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
/* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->direct_map) {
+ if (vcpu->arch.mmu->root_role.direct) {
unsigned int indirect_shadow_pages;
write_lock(&vcpu->kvm->mmu_lock);
@@ -7263,7 +8227,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
vcpu->arch.last_retry_eip = ctxt->eip;
vcpu->arch.last_retry_addr = cr2_or_gpa;
- if (!vcpu->arch.mmu->direct_map)
+ if (!vcpu->arch.mmu->root_role.direct)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -7274,14 +8238,24 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
- if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
- /* This is a good place to trace that we are exiting SMM. */
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
/* Process a latched INIT or SMI, if any. */
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
}
kvm_mmu_reset_context(vcpu);
@@ -7326,6 +8300,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
/*
* rflags is the old, "raw" value of the flags. The new value has
* not been saved yet.
@@ -7340,7 +8316,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
@@ -7409,33 +8385,24 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
}
/*
- * Decode to be emulated instruction. Return EMULATION_OK if success.
+ * Decode an instruction for emulation. The caller is responsible for handling
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been done by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ * response to a machine check.
*/
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
void *insn, int insn_len)
{
- int r = EMULATION_OK;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int r;
init_emulate_ctxt(vcpu);
- /*
- * We will reenter on the same instruction since we do not set
- * complete_userspace_io. This does not handle watchpoints yet,
- * those would be handled in the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
-
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
@@ -7452,7 +8419,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -7467,6 +8434,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
+ /*
+ * Return immediately if RIP hits a code breakpoint, such #DBs
+ * are fault-like and are higher priority than any faults on
+ * the code fetch itself.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
+ return r;
+
r = x86_decode_emulated_instruction(vcpu, emulation_type,
insn, insn_len);
if (r != EMULATION_OK) {
@@ -7500,12 +8476,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
- * for kvm_skip_emulated_instruction(). The caller is responsible for
- * updating interruptibility state and injecting single-step #DBs.
+ * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+ * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+ * The caller is responsible for updating interruptibility state and
+ * injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+ else
+ ctxt->eip = ctxt->_eip;
+
+ if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+ r = 1;
+ goto writeback;
+ }
+
+ kvm_rip_write(vcpu, ctxt->eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return 1;
@@ -7527,7 +8514,7 @@ restart:
ctxt->exception.address = cr2_or_gpa;
/* With shadow page tables, cr2 contains a GVA or nGPA. */
- if (vcpu->arch.mmu->direct_map) {
+ if (vcpu->arch.mmu->root_role.direct) {
ctxt->gpa_available = true;
ctxt->gpa_val = cr2_or_gpa;
}
@@ -7569,22 +8556,28 @@ restart:
writeback = false;
r = 0;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (vcpu->arch.complete_userspace_io) {
+ writeback = false;
+ r = 0;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = 1;
+writeback:
if (writeback) {
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ if (ctxt->is_branch)
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- if (kvm_x86_ops.update_emulated_instruction)
- static_call(kvm_x86_update_emulated_instruction)(vcpu);
+ static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7738,14 +8731,13 @@ static void tsc_khz_changed(void *data)
static void kvm_hyperv_tsc_notifier(void)
{
struct kvm *kvm;
- struct kvm_vcpu *vcpu;
int cpu;
- unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
@@ -7754,18 +8746,11 @@ static void kvm_hyperv_tsc_notifier(void)
kvm_max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
- struct kvm_arch *ka = &kvm->arch;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+ kvm_end_pvclock_update(kvm);
}
+
mutex_unlock(&kvm_lock);
}
#endif
@@ -7774,7 +8759,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
+ int send_ipi = 0;
+ unsigned long i;
/*
* We allow guests to temporarily run on slowing clocks,
@@ -7875,22 +8861,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
static void kvm_timer_init(void)
{
- max_tsc_khz = tsc_khz;
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy *policy;
- int cpu;
-
- cpu = get_cpu();
- policy = cpufreq_cpu_get(cpu);
- if (policy) {
- if (policy->cpuinfo.max_freq)
- max_tsc_khz = policy->cpuinfo.max_freq;
- cpufreq_cpu_put(policy);
+ max_tsc_khz = tsc_khz;
+
+ if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+ struct cpufreq_policy *policy;
+ int cpu;
+
+ cpu = get_cpu();
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
+ put_cpu();
}
- put_cpu();
-#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
@@ -7899,57 +8885,12 @@ static void kvm_timer_init(void)
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
-DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
-
-int kvm_is_in_guest(void)
-{
- return __this_cpu_read(current_vcpu) != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (__this_cpu_read(current_vcpu))
- user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (__this_cpu_read(current_vcpu))
- ip = kvm_rip_read(__this_cpu_read(current_vcpu));
-
- return ip;
-}
-
-static void kvm_handle_intel_pt_intr(void)
-{
- struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
-
- kvm_make_request(KVM_REQ_PMI, vcpu);
- __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
- (unsigned long *)&vcpu->arch.pmu.global_status);
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
- .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
-};
-
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm *kvm;
-
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -7962,6 +8903,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -7972,13 +8925,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -7993,18 +8947,20 @@ int kvm_arch_init(void *opaque)
int r;
if (kvm_x86_ops.hardware_enable) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
+ pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support for '%s'\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
@@ -8020,19 +8976,18 @@ int kvm_arch_init(void *opaque)
goto out;
}
- r = -ENOMEM;
- x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
- __alignof__(struct fpu), SLAB_ACCOUNT,
- NULL);
- if (!x86_fpu_cache) {
- printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ r = -EOPNOTSUPP;
goto out;
}
+ r = -ENOMEM;
+
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("kvm: failed to allocate cache for x86 emulator\n");
- goto out_free_x86_fpu_cache;
+ goto out;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
@@ -8040,25 +8995,21 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
+ kvm_nr_uret_msrs = 0;
- r = kvm_mmu_module_init();
+ r = kvm_mmu_vendor_module_init();
if (r)
goto out_free_percpu;
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
@@ -8072,8 +9023,6 @@ out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
-out_free_x86_fpu_cache:
- kmem_cache_destroy(x86_fpu_cache);
out:
return r;
}
@@ -8085,7 +9034,6 @@ void kvm_arch_exit(void)
clear_hv_tscchange_cb();
#endif
kvm_lapic_exit();
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -8093,19 +9041,28 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
- kvm_mmu_module_exit();
+ kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
- kmem_cache_destroy(x86_fpu_cache);
+ kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
}
-static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel, the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = state;
@@ -8116,11 +9073,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
}
}
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
{
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
@@ -8129,7 +9086,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
- return kvm_vcpu_halt(vcpu) && ret;
+ return kvm_emulate_halt_noskip(vcpu) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
@@ -8137,7 +9094,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
{
int ret = kvm_skip_emulated_instruction(vcpu);
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
@@ -8153,6 +9111,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
+ /*
+ * When tsc is in permanent catchup mode guests won't be able to use
+ * pvclock_read_retry loop to get consistent view of pvclock
+ */
+ if (vcpu->arch.tsc_always_catchup)
+ return -KVM_EOPNOTSUPP;
+
if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
@@ -8176,7 +9141,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
*
* @apicid - apicid of vcpu to be kicked.
*/
-static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
{
struct kvm_lapic_irq lapic_irq;
@@ -8196,32 +9161,82 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+ ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+ ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+ return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
+
+static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
+ enum kvm_apicv_inhibit reason, bool set)
{
- if (enable)
- clear_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
+ if (set)
+ __set_bit(reason, inhibits);
else
- set_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
+ __clear_bit(reason, inhibits);
+
+ trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
+}
+
+static void kvm_apicv_init(struct kvm *kvm)
+{
+ unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
+
+ init_rwsem(&kvm->arch.apicv_update_lock);
+
+ set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+
+ if (!enable_apicv)
+ set_or_clear_apicv_inhibit(inhibits,
+ APICV_INHIBIT_REASON_DISABLE, true);
}
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
struct kvm_vcpu *target = NULL;
struct kvm_apic_map *map;
+ vcpu->stat.directed_yield_attempted++;
+
+ if (single_task_running())
+ goto no_yield;
+
rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
target = map->phys_map[dest_id]->vcpu;
rcu_read_unlock();
- if (target && READ_ONCE(target->ready))
- kvm_vcpu_yield_to(target);
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
+}
+
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ u64 ret = vcpu->run->hypercall.ret;
+
+ if (!is_64_bit_mode(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8243,7 +9258,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_mode(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -8267,8 +9282,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
break;
- kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
- kvm_sched_yield(vcpu->kvm, a1);
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
@@ -8286,9 +9301,31 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
break;
- kvm_sched_yield(vcpu->kvm, a0);
+ kvm_sched_yield(vcpu, a0);
ret = 0;
break;
+ case KVM_HC_MAP_GPA_RANGE: {
+ u64 gpa = a0, npages = a1, attrs = a2;
+
+ ret = -KVM_ENOSYS;
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ break;
+
+ if (!PAGE_ALIGNED(gpa) || !npages ||
+ gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+ ret = -KVM_EINVAL;
+ break;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = attrs;
+ vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ return 0;
+ }
default:
ret = -KVM_ENOSYS;
break;
@@ -8309,6 +9346,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
+ /*
+ * If the quirk is disabled, synthesize a #UD and let the guest pick up
+ * the pieces.
+ */
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+ ctxt->exception.error_code_valid = false;
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->have_exception = true;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
@@ -8321,20 +9369,15 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
likely(!pic_in_kernel(vcpu->kvm));
}
+/* Called within kvm->srcu read side. */
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * if_flag is obsolete and useless, so do not bother
- * setting it for SEV-ES guests. Userspace can just
- * use kvm_run->ready_for_interrupt_injection.
- */
- kvm_run->if_flag = !vcpu->arch.guest_state_protected
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
@@ -8369,7 +9412,25 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
+ }
+
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ static_call(kvm_x86_queue_exception)(vcpu);
+}
+
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
bool can_inject = true;
@@ -8377,7 +9438,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
/*
@@ -8396,10 +9457,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
can_inject = false;
}
}
@@ -8414,9 +9475,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* from L2 to L1.
*/
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
- goto busy;
+ goto out;
}
/* try to inject new event if pending */
@@ -8440,10 +9501,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- static_call(kvm_x86_queue_exception)(vcpu);
+ kvm_inject_exception(vcpu);
can_inject = false;
}
+ /* Don't inject interrupts if the user asked to avoid doing so */
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+ return 0;
+
/*
* Finally, inject interrupt events. If an event cannot be injected
* due to architectural conditions (e.g. IF=0) a window-open exit
@@ -8458,7 +9523,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
@@ -8471,11 +9536,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_set_nmi)(vcpu);
+ static_call(kvm_x86_inject_nmi)(vcpu);
can_inject = false;
WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
@@ -8486,10 +9551,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (kvm_cpu_has_injectable_intr(vcpu)) {
r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_set_irq)(vcpu);
+ static_call(kvm_x86_inject_irq)(vcpu);
WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
@@ -8502,11 +9567,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*req_immediate_exit = true;
WARN_ON(vcpu->arch.exception.pending);
- return;
+ return 0;
-busy:
- *req_immediate_exit = true;
- return;
+out:
+ if (r == -EBUSY) {
+ *req_immediate_exit = true;
+ r = 0;
+ }
+ return r;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8589,7 +9657,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
for (i = 0; i < 8; i++)
- put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u32, buf, 0x7fcc, (u32)val);
@@ -8635,7 +9703,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
int i;
for (i = 0; i < 16; i++)
- put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
@@ -8685,10 +9753,9 @@ static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
+ unsigned long cr0;
char buf[512];
- u32 cr0;
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -8698,13 +9765,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
enter_smm_save_state_32(vcpu, buf);
/*
- * Give pre_enter_smm() a chance to make ISA-specific changes to the
- * vCPU state (e.g. leave guest mode) after we've saved the state into
- * the SMM state-save area.
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
*/
- static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+ static_call(kvm_x86_enter_smm)(vcpu, buf);
- vcpu->arch.hflags |= HF_SMM_MASK;
+ kvm_smm_changed(vcpu, true);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -8770,14 +9837,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
- cpumask_var_t cpus;
-
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
-
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
- NULL, vcpu_bitmap, cpus);
-
- free_cpumask_var(cpus);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
@@ -8787,62 +9847,87 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
+ bool activate;
+
if (!lapic_in_kernel(vcpu))
return;
- vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
+
+ activate = kvm_vcpu_apicv_activated(vcpu);
+
+ if (vcpu->arch.apicv_active == activate)
+ goto out;
+
+ vcpu->arch.apicv_active = activate;
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+ /*
+ * When APICv gets disabled, we may still have injected interrupts
+ * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+ * still active when the interrupt got accepted. Make sure
+ * inject_pending_event() is called to check for that.
+ */
+ if (!vcpu->arch.apicv_active)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+ preempt_enable();
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
{
- struct kvm_vcpu *except;
- unsigned long old, new, expected;
+ unsigned long old, new;
+
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
+ if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
return;
- old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
- do {
- expected = new = old;
- if (activate)
- __clear_bit(bit, &new);
- else
- __set_bit(bit, &new);
- if (new == old)
- break;
- old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
- } while (old != expected);
+ old = new = kvm->arch.apicv_inhibit_reasons;
- if (!!old == !!new)
- return;
+ set_or_clear_apicv_inhibit(&new, reason, set);
- trace_kvm_apicv_update_request(activate, bit);
- if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+ if (!!old != !!new) {
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in svm_vcpu_run().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo svm_vcpu_run() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+ kvm->arch.apicv_inhibit_reasons = new;
+ if (new) {
+ unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ }
+ } else {
+ kvm->arch.apicv_inhibit_reasons = new;
+ }
+}
- /*
- * Sending request to update APICV for all other vcpus,
- * while update the calling vcpu immediately instead of
- * waiting for another #VMEXIT to handle the request.
- */
- except = kvm_get_running_vcpu();
- kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
- except);
- if (except)
- kvm_vcpu_update_apicv(except);
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ if (!enable_apicv)
+ return;
+
+ down_write(&kvm->arch.apicv_update_lock);
+ __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
+ up_write(&kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
+EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
@@ -8854,8 +9939,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -8873,12 +9957,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- if (to_hv_vcpu(vcpu))
+ if (to_hv_vcpu(vcpu)) {
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ return;
+ }
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(
+ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8895,15 +9983,17 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
- if (!lapic_in_kernel(vcpu))
- return;
+ static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+}
- if (!kvm_x86_ops.set_apic_access_page_addr)
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
return;
- static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8913,6 +10003,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
/*
+ * Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
* userspace.
@@ -8937,18 +10028,22 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ r = -EIO;
+ goto out;
+ }
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
goto out;
}
}
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
- kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
- kvm_gen_update_masterclock(vcpu->kvm);
+ kvm_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
@@ -8966,10 +10061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -8977,10 +10069,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
- vcpu->mmio_needed = 0;
- r = 0;
- goto out;
+ if (is_guest_mode(vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
@@ -9018,12 +10114,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
@@ -9057,13 +10155,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
goto out;
}
- inject_pending_event(vcpu, &req_immediate_exit);
+ r = inject_pending_event(vcpu, &req_immediate_exit);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (req_int_win)
static_call(kvm_x86_enable_irq_window)(vcpu);
@@ -9080,7 +10186,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_guest_switch)(vcpu);
+ static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -9088,9 +10194,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* result in virtual interrupt delivery.
*/
local_irq_disable();
- vcpu->mode = IN_GUEST_MODE;
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
/*
* 1) We should set ->mode before checking ->requests. Please see
@@ -9107,18 +10215,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock();
/*
- * This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * Process pending posted interrupts to handle the case where the
+ * notification IRQ arrived in the host, or was never sent (because the
+ * target vCPU wasn't running). Do this regardless of the vCPU's APICv
+ * status, KVM doesn't update assigned devices when APICv is inhibited,
+ * i.e. they can post interrupts even if APICv is temporarily disabled.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
r = 1;
goto cancel_injection;
}
@@ -9132,29 +10243,42 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
- set_debugreg(vcpu->arch.dr6, 6);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
}
+ guest_timing_enter_irqoff();
+
for (;;) {
- exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responsing vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
+
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
break;
}
-
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
- }
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9167,7 +10291,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
/*
@@ -9186,8 +10309,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ /*
+ * Sync xfd before calling handle_exit_irqoff() which may
+ * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+ * in #NM irqoff handler).
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ fpu_sync_guest_vmexit_xfd_state();
+
static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
/*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
@@ -9195,24 +10329,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* interrupts on processors that implement an interrupt shadow, the
* stat.exits increment will do nicely.
*/
- kvm_before_interrupt(vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
local_irq_enable();
++vcpu->stat.exits;
local_irq_disable();
kvm_after_interrupt(vcpu);
- if (lapic_in_kernel(vcpu)) {
- s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
- if (delta != S64_MIN) {
- trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
- vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
- }
- }
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ guest_timing_exit_irqoff();
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
/*
* Profile KVM exit RIPs:
@@ -9241,22 +10376,39 @@ out:
return r;
}
-static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+/* Called within kvm->srcu read side. */
+static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ bool hv_timer;
+
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ /*
+ * Switch to the software timer before halt-polling/blocking as
+ * the guest's timer may be a break event for the vCPU, and the
+ * hypervisor timer runs only when the CPU is in guest mode.
+ * Switch before halt-polling so that KVM recognizes an expired
+ * timer before blocking.
+ */
+ hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ kvm_vcpu_halt(vcpu);
+ else
+ kvm_vcpu_block(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
- if (kvm_x86_ops.post_block)
- static_call(kvm_x86_post_block)(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
- kvm_apic_accept_events(vcpu);
+ if (kvm_apic_accept_events(vcpu) < 0)
+ return 0;
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
case KVM_MP_STATE_AP_RESET_HOLD:
@@ -9278,31 +10430,40 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
- kvm_x86_ops.nested_ops->check_events(vcpu);
+ kvm_check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
+/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
- struct kvm *kvm = vcpu->kvm;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
- r = vcpu_block(kvm, vcpu);
+ r = vcpu_block(vcpu);
}
if (r <= 0)
break;
- kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ kvm_xen_inject_pending_events(vcpu);
+
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -9315,27 +10476,20 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
}
if (__xfer_to_guest_mode_work_pending()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
r = xfer_to_guest_mode_handle_work(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (r)
return r;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-
return r;
}
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
- int r;
-
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- return r;
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -9408,58 +10562,18 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
- /*
- * If the target FPU state is not resident in the CPU registers, just
- * memcpy() from current, else save CPU state directly to the target.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&fpu->state, &current->thread.fpu.state,
- fpu_kernel_xstate_size);
- else
- copy_fpregs_to_fpstate(fpu);
-}
-
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- kvm_save_current_fpu(vcpu->arch.user_fpu);
-
- /*
- * Guests with protected state can't have it set by the hypervisor,
- * so skip trying to set it.
- */
- if (vcpu->arch.guest_fpu)
- /* PKRU is separately restored in kvm_x86_ops.run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- /*
- * Guests with protected state can't have it read by the hypervisor,
- * so skip trying to save it.
- */
- if (vcpu->arch.guest_fpu)
- kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
@@ -9474,13 +10588,26 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
goto out;
}
+ /*
+ * It should be impossible for the hypervisor timer to be in
+ * use before KVM has ever run the vCPU.
+ */
+ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
- kvm_apic_accept_events(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (kvm_apic_accept_events(vcpu) < 0) {
+ r = 0;
+ goto out;
+ }
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
@@ -9491,7 +10618,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+ (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
r = -EINVAL;
goto out;
}
@@ -9519,18 +10647,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- if (kvm_run->immediate_exit)
+ if (kvm_run->immediate_exit) {
r = -EINTR;
- else
- r = vcpu_run(vcpu);
+ goto out;
+ }
+
+ r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ if (r <= 0)
+ goto out;
+
+ r = vcpu_run(vcpu);
out:
kvm_put_guest_fpu(vcpu);
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
- kvm_sigset_deactivate(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -9619,17 +10754,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -9662,14 +10787,36 @@ skip_protected_regs:
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
+}
+
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ __get_sregs_common(vcpu, sregs);
- memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+ if (vcpu->arch.guest_state_protected)
+ return;
if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
}
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int i;
+
+ __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ for (i = 0 ; i < 4 ; i++)
+ sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+ sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ }
+}
+
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -9682,11 +10829,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int r;
+
vcpu_load(vcpu);
if (kvm_mpx_supported())
kvm_load_guest_fpu(vcpu);
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0)
+ goto out;
+ r = 0;
+
if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
vcpu->arch.pv.pv_unhalted)
@@ -9694,10 +10847,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+out:
if (kvm_mpx_supported())
kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
- return 0;
+ return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -9781,24 +10935,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvm_is_valid_cr4(vcpu, sregs->cr4);
}
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
{
struct msr_data apic_base_msr;
- int mmu_reset_needed = 0;
- int pending_vec, max_bits, idx;
+ int idx;
struct desc_ptr dt;
- int ret = -EINVAL;
if (!kvm_is_valid_sregs(vcpu, sregs))
- goto out;
+ return -EINVAL;
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))
- goto out;
+ return -EINVAL;
if (vcpu->arch.guest_state_protected)
- goto skip_protected_regs;
+ return 0;
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
@@ -9808,31 +10961,31 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
- mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
- mmu_reset_needed = 1;
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -9852,20 +11005,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-skip_protected_regs:
+ return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ int pending_vec, max_bits;
+ int mmu_reset_needed = 0;
+ int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+ if (ret)
+ return ret;
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
(const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
if (pending_vec < max_bits) {
kvm_queue_interrupt(vcpu, pending_vec, false);
pr_debug("Set back pending irq %d\n", pending_vec);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+ return 0;
+}
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int mmu_reset_needed = 0;
+ bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+ !(sregs2->efer & EFER_LMA);
+ int i, ret;
- ret = 0;
-out:
- return ret;
+ if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+ return -EINVAL;
+
+ if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+ return -EINVAL;
+
+ ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+ &mmu_reset_needed, !valid_pdptrs);
+ if (ret)
+ return ret;
+
+ if (valid_pdptrs) {
+ for (i = 0; i < 4 ; i++)
+ kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ mmu_reset_needed = 1;
+ vcpu->arch.pdptrs_from_userspace = true;
+ }
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -9879,6 +11075,27 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return ret;
}
+static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
+{
+ bool set = false;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&kvm->arch.apicv_update_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
+ set = true;
+ break;
+ }
+ }
+ __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
+ up_write(&kvm->arch.apicv_update_lock);
+}
+
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
@@ -9921,8 +11138,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -9932,6 +11148,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
+
r = 0;
out:
@@ -9967,12 +11185,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
fpu->fsw = fxsave->swd;
@@ -9990,12 +11208,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -10027,9 +11245,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
static int sync_regs(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
- return -EINVAL;
-
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10049,33 +11264,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
return 0;
}
-static void fx_init(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.guest_fpu)
- return;
-
- fpstate_init(&vcpu->arch.guest_fpu->state);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
- host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Ensure guest xcr0 is valid for loading
- */
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
-
- vcpu->arch.cr0 |= X86_CR0_ET;
-}
-
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.guest_fpu) {
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
- vcpu->arch.guest_fpu = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
-
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10090,13 +11278,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
struct page *page;
int r;
+ vcpu->arch.last_vmentry_cpu = -1;
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
return r;
@@ -10105,8 +11295,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
- if (kvm_apicv_activated(vcpu->kvm))
+
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as
+ * this vCPU will not get notified of any changes until this
+ * vCPU is visible to other vCPUs (marked online and added to
+ * the set of vCPUs). Opportunistically mark APICv active as
+ * VMX in particularly is highly unlikely to have inhibits.
+ * Ignore the current per-VM APICv state so that vCPU creation
+ * is guaranteed to run with a deterministic value, the request
+ * will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
vcpu->arch.apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
} else
static_branch_inc(&kvm_has_noapic_vcpu);
@@ -10130,20 +11333,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (!alloc_emulate_ctxt(vcpu))
goto free_wbinvd_dirty_mask;
- vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.user_fpu) {
- pr_err("kvm: failed to allocate userspace's fpu\n");
- goto free_emulate_ctxt;
- }
-
- vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.guest_fpu) {
+ if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
pr_err("kvm: failed to allocate vcpu's fpu\n");
- goto free_user_fpu;
+ goto free_emulate_ctxt;
}
- fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -10156,23 +11349,27 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
+#if IS_ENABLED(CONFIG_HYPERV)
+ vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
- kvm_init_mmu(vcpu, false);
+ kvm_init_mmu(vcpu);
vcpu_put(vcpu);
return 0;
free_guest_fpu:
- kvm_free_guest_fpu(vcpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
free_emulate_ctxt:
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_wbinvd_dirty_mask:
@@ -10210,20 +11407,17 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
int idx;
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
kvmclock_reset(vcpu);
static_call(kvm_x86_vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
- kvm_free_guest_fpu(vcpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+ kvm_xen_destroy_vcpu(vcpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
@@ -10239,6 +11433,20 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long new_cr0;
+
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+
kvm_lapic_reset(vcpu, init_event);
vcpu->arch.hflags = 0;
@@ -10270,8 +11478,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
- void *mpx_state_buffer;
+ if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
/*
* To avoid have the INIT path from kvm_apic_has_events() that be
@@ -10279,14 +11487,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDREGS);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDCSR);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
if (init_event)
kvm_load_guest_fpu(vcpu);
}
@@ -10297,17 +11501,74 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.msr_misc_features_enables = 0;
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+ __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
}
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
- vcpu->arch.ia32_xss = 0;
+ /*
+ * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+ * if no CPUID match is found. Note, it's impossible to get a match at
+ * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
+ */
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ vcpu->arch.cr3 = 0;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
+ * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+ * (or qualify) that with a footnote stating that CD/NW are preserved.
+ */
+ new_cr0 = X86_CR0_ET;
+ if (init_event)
+ new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+ else
+ new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+ static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
+ /*
+ * On the standard CR0/CR4/EFER modification paths, there are several
+ * complex conditions determining whether the MMU has to be reset and/or
+ * which PCIDs have to be flushed. However, CR0.WP and the paging-related
+ * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+ * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+ * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
+ */
+ if (old_cr0 & X86_CR0_PG) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ kvm_mmu_reset_context(vcpu);
+ }
+
+ /*
+ * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
+ * APM states the TLBs are untouched by INIT, but it also states that
+ * the TLBs are flushed on "External initialization of the processor."
+ * Flush the guest TLB regardless of vendor, there is no meaningful
+ * benefit in relying on the guest to flush the TLB immediately after
+ * INIT. A spurious TLB flush is benign and likely negligible from a
+ * performance perspective.
+ */
+ if (init_event)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
@@ -10325,7 +11586,7 @@ int kvm_arch_hardware_enable(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
int ret;
u64 local_tsc;
u64 max_tsc = 0;
@@ -10418,6 +11679,24 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
int kvm_arch_hardware_setup(void *opaque)
{
struct kvm_x86_init_ops *ops = opaque;
@@ -10432,8 +11711,9 @@ int kvm_arch_hardware_setup(void *opaque)
if (r != 0)
return r;
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
- kvm_ops_static_call_update();
+ kvm_ops_update(ops);
+
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -10452,16 +11732,16 @@ int kvm_arch_hardware_setup(void *opaque)
u64 max = min(0x7fffffffULL,
__scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
kvm_max_guest_tsc_khz = max;
-
- kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
}
-
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
kvm_init_msr_list();
return 0;
}
void kvm_arch_hardware_unsetup(void)
{
+ kvm_unregister_perf_callbacks();
+
static_call(kvm_x86_hardware_unsetup)();
}
@@ -10508,19 +11788,27 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_free_vm(struct kvm *kvm)
{
kfree(to_kvm_hv(kvm)->hv_pa_pg);
- vfree(kvm);
+ __kvm_arch_free_vm(kvm);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ int ret;
+ unsigned long flags;
+
if (type)
return -EINVAL;
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ goto out;
+
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_page_track;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -10532,21 +11820,35 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
-
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
kvm->arch.guest_can_read_msr_platform_info = true;
+ kvm->arch.enable_pmu = enable_pmu;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+ kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
- kvm_page_track_init(kvm);
- kvm_mmu_init_vm(kvm);
+ kvm_xen_init_vm(kvm);
return static_call(kvm_x86_vm_init)(kvm);
+
+out_page_track:
+ kvm_page_track_cleanup(kvm);
+out:
+ return ret;
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -10561,27 +11863,15 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_unload_vcpu_mmus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
- /*
- * Unpin any mmu pages first.
- */
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
}
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_destroy(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
-
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -10591,8 +11881,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
-#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
-
/**
* __x86_set_memory_region: Setup KVM internal memory slot
*
@@ -10678,7 +11966,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
- * unless the the memory map has changed due to process exit
+ * unless the memory map has changed due to process exit
* or fd copying.
*/
mutex_lock(&kvm->slots_lock);
@@ -10689,11 +11977,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
+ kvm_unload_vcpu_mmus(kvm);
static_call_cond(kvm_x86_vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
- kvm_free_vcpus(kvm);
+ kvm_destroy_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -10702,17 +11991,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
}
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
+ }
+}
- if (i == 0)
- continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ int i;
+
+ memslot_rmap_free(slot);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10720,11 +12015,34 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
- unsigned long npages)
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
{
+ const int sz = sizeof(*slot->arch.rmap[0]);
int i;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ int level = i + 1;
+ int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+ if (slot->arch.rmap[i])
+ continue;
+
+ slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.rmap[i]) {
+ memslot_rmap_free(slot);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ unsigned long npages = slot->npages;
+ int i, r;
+
/*
* Clear out the previous array pointers for the KVM_MR_MOVE case. The
* old arrays will be freed by __kvm_set_memory_region() if installing
@@ -10732,24 +12050,21 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
*/
memset(&slot->arch, 0, sizeof(slot->arch));
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (kvm_memslots_have_rmaps(kvm)) {
+ r = memslot_rmap_alloc(slot, npages);
+ if (r)
+ return r;
+ }
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
int level = i + 1;
- lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
-
- slot->arch.rmap[i] =
- kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL_ACCOUNT);
- if (!slot->arch.rmap[i])
- goto out_free;
- if (i == 0)
- continue;
+ lpages = __kvm_mmu_slot_lpages(slot, npages, level);
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+ linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -10772,18 +12087,15 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
}
}
- if (kvm_page_track_create_memslot(slot, npages))
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.rmap[i]);
- slot->arch.rmap[i] = NULL;
- if (i == 0)
- continue;
+ memslot_rmap_free(slot);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10793,7 +12105,7 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
/*
* memslots->generation has been incremented.
@@ -10807,13 +12119,22 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- const struct kvm_userspace_memory_region *mem,
- enum kvm_mr_change change)
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(memslot,
- mem->memory_size >> PAGE_SHIFT);
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+ if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+ return -EINVAL;
+
+ return kvm_alloc_memslot_metadata(kvm, new);
+ }
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ memcpy(&new->arch, &old->arch, sizeof(old->arch));
+ else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+ return -EIO;
+
return 0;
}
@@ -10834,16 +12155,18 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *old,
- struct kvm_memory_slot *new,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
/*
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
- if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
@@ -10861,7 +12184,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot().
*/
- if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
/*
@@ -10869,7 +12192,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
* logging isn't being toggled on or off.
*/
- if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
return;
if (!log_dirty_pages) {
@@ -10888,53 +12211,40 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
} else {
- /* By default, write-protect everything to log writes. */
- int level = PG_LEVEL_4K;
+ /*
+ * Initially-all-set does not require write protecting any page,
+ * because they're all assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
- if (kvm_x86_ops.cpu_dirty_log_size) {
- /*
- * Clear all dirty bits, unless pages are treated as
- * dirty from the get-go.
- */
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
- kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
- /*
- * Write-protect large pages on write so that dirty
- * logging happens at 4k granularity. No need to
- * write-protect small SPTEs since write accesses are
- * logged by the CPU via dirty bits.
- */
- level = PG_LEVEL_2M;
- } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- /*
- * If we're with initial-all-set, we don't need
- * to write protect any small page because
- * they're reported as dirty already. However
- * we still need to write-protect huge pages
- * so that the page split can happen lazily on
- * the first write to the huge page.
- */
- level = PG_LEVEL_2M;
+ if (kvm_x86_ops.cpu_dirty_log_size) {
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+ } else {
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
}
- kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- if (!kvm->arch.n_requested_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm,
- kvm_mmu_calculate_default_mmu_pages(kvm));
+ if (!kvm->arch.n_requested_mmu_pages &&
+ (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+ unsigned long nr_mmu_pages;
- /*
- * FIXME: const-ify all uses of struct kvm_memory_slot.
- */
- kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+ nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
+
+ kvm_mmu_slot_apply_flags(kvm, old, new, change);
/* Free the arrays associated with the old memslot. */
if (change == KVM_MR_MOVE)
@@ -10955,8 +12265,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops.guest_apic_has_interrupt &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10996,6 +12305,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
return true;
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
+ return true;
+
return false;
}
@@ -11004,6 +12319,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ return true;
+
+ return false;
+}
+
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
@@ -11014,17 +12337,22 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
-
- return false;
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
}
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.guest_state_protected)
+ return true;
+
return vcpu->arch.preempted_in_kernel;
}
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+{
+ return kvm_rip_read(vcpu);
+}
+
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -11080,25 +12408,6 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
- work->wakeup_all)
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu->direct_map &&
- work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
- return;
-
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
-}
-
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
@@ -11196,14 +12505,28 @@ static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+
+ if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
+ if (vcpu->arch.apf.send_user_only &&
+ static_call(kvm_x86_get_cpl)(vcpu) == 0)
return false;
- return true;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1 needs to opt into the special #PF vmexits that are
+ * used to deliver async page faults.
+ */
+ return vcpu->arch.apf.delivery_as_pf_vmexit;
+ } else {
+ /*
+ * Play it safe in case the guest temporarily disables paging.
+ * The real mode IDT in particular is unlikely to have a #PF
+ * exception setup.
+ */
+ return is_paging(vcpu);
+ }
}
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -11292,12 +12615,13 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
if (!kvm_pv_async_pf_enabled(vcpu))
return true;
else
- return apf_pageready_slot_free(vcpu);
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.assigned_device_count);
+ if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+ static_call_cond(kvm_x86_pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -11345,7 +12669,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -11370,7 +12694,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -11381,7 +12705,16 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
+}
+
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
}
bool kvm_vector_hashing_enabled(void)
@@ -11424,12 +12757,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
struct x86_exception fault;
- u32 access = error_code &
+ u64 access = error_code &
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
/*
* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
@@ -11465,9 +12799,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
* doesn't seem to be a real use-case behind such requests, just return
* KVM_EXIT_INTERNAL_ERROR for now.
*/
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -11477,8 +12809,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
bool pcid_enabled;
struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
@@ -11512,23 +12842,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
+ kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
case INVPCID_TYPE_ALL_NON_GLOBAL:
@@ -11541,11 +12855,12 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default:
- BUG(); /* We have already checked above that type <= 3 */
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
@@ -11673,44 +12988,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
-static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
{
- memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
- vcpu->arch.pio.count * vcpu->arch.pio.size);
- vcpu->arch.pio.count = 0;
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+ vcpu->arch.pio.count = 0;
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_outs(vcpu, size, port);
return 1;
}
static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port)
{
- int ret;
-
- ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret)
- return ret;
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+ /* memcpy done already by emulator_pio_out. */
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+ if (!ret)
+ break;
- vcpu->arch.pio.count = 0;
+ /* Emulation done by the kernel. */
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
return 0;
}
static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port);
+
+static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
- int ret;
+ unsigned count = vcpu->arch.pio.count;
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+}
- ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret) {
- vcpu->arch.pio.count = 0;
- } else {
- vcpu->arch.guest_ins_data = data;
- vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ advance_sev_es_emulated_ins(vcpu);
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_ins(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ if (!__emulator_pio_in(vcpu, size, port, count))
+ break;
+
+ /* Emulation done by the kernel. */
+ advance_sev_es_emulated_ins(vcpu);
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
}
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
return 0;
}
@@ -11718,8 +13070,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in)
{
- return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
- : kvm_sev_es_outs(vcpu, size, port, data, count);
+ vcpu->arch.sev_pio_data = data;
+ vcpu->arch.sev_pio_count = count;
+ return in ? kvm_sev_es_ins(vcpu, size, port)
+ : kvm_sev_es_outs(vcpu, size, port);
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
@@ -11745,8 +13099,25 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9035e34aa156..588792f00334 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,16 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+void kvm_spurious_fault(void);
+
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -48,6 +58,9 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
@@ -96,20 +109,22 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
{
int cs_db, cs_l;
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
+
if (!is_long_mode(vcpu))
return false;
static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
-static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
{
-#ifdef CONFIG_X86_64
- return (vcpu->arch.efer & EFER_LMA) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
-#else
- return 0;
-#endif
+ /*
+ * If running with protected guest state, the CS register is not
+ * accessible. The hypercall register values will have had to been
+ * provided in 64-bit mode, so assume the guest is in 64-bit.
+ */
+ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
}
static inline bool x86_exception_has_error_code(unsigned int vector)
@@ -126,12 +141,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -157,14 +166,9 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
}
-static inline u64 get_canonical(u64 la, u8 vaddr_bits)
-{
- return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
-}
-
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
- return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
@@ -222,19 +226,19 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
-static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg)
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
{
- unsigned long val = kvm_register_read(vcpu, reg);
+ unsigned long val = kvm_register_read_raw(vcpu, reg);
return is_64_bit_mode(vcpu) ? val : (u32)val;
}
-static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
int reg, unsigned long val)
{
if (!is_64_bit_mode(vcpu))
val = (u32)val;
- return kvm_register_write(vcpu, reg, val);
+ return kvm_register_write_raw(vcpu, reg, val);
}
static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
@@ -247,7 +251,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -283,6 +286,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0;
extern u64 host_xss;
extern u64 supported_xss;
+extern bool enable_pmu;
static inline bool kvm_mpx_supported(void)
{
@@ -296,10 +300,10 @@ extern bool enable_vmware_backdoor;
extern int pi_inject_timer;
-extern struct static_key kvm_no_apic_vcpu;
-
extern bool report_ignored_msrs;
+extern bool eager_page_split;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -340,18 +344,27 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
return kvm->arch.cstate_in_guest;
}
-DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
- __this_cpu_write(current_vcpu, vcpu);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
- __this_cpu_write(current_vcpu, NULL);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
+static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+}
static inline bool kvm_pat_valid(u64 data)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index ae17250e1efe..610beba35907 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -9,53 +9,100 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
+#include "lapic.h"
+#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
#include "trace.h"
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct pvclock_wall_clock *wc;
gpa_t gpa = gfn_to_gpa(gfn);
- int wc_ofs, sec_hi_ofs;
- int ret;
+ u32 *wc_sec_hi;
+ u32 wc_version;
+ u64 wall_nsec;
+ int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
- gpa, PAGE_SIZE);
- if (ret)
+ if (gfn == GPA_INVALID) {
+ kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
goto out;
+ }
+
+ do {
+ ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
+ gpa, PAGE_SIZE);
+ if (ret)
+ goto out;
- kvm->arch.xen.shinfo_set = true;
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+
+ /* It could be invalid again already, so we need to check */
+ read_lock_irq(&gpc->lock);
+
+ if (gpc->valid)
+ break;
+
+ read_unlock_irq(&gpc->lock);
+ } while (1);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
- /* 32-bit location by default */
- wc_ofs = offsetof(struct compat_shared_info, wc);
- sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
-
#ifdef CONFIG_X86_64
/* Paranoia checks on the 64-bit struct layout */
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
- if (kvm->arch.xen.long_mode) {
- wc_ofs = offsetof(struct shared_info, wc);
- sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
- }
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->wc_sec_hi;
+ wc = &shinfo->wc;
+ } else
#endif
+ {
+ struct compat_shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->arch.wc_sec_hi;
+ wc = &shinfo->wc;
+ }
+
+ /* Increment and ensure an odd value */
+ wc_version = wc->version = (wc->version + 1) | 1;
+ smp_wmb();
+
+ wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->sec = (u32)wall_nsec;
+ *wc_sec_hi = wall_nsec >> 32;
+ smp_wmb();
+
+ wc->version = wc_version + 1;
+ read_unlock_irq(&gpc->lock);
- kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
out:
@@ -63,6 +110,66 @@ out:
return ret;
}
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+ struct kvm_xen_evtchn e;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+ arch.xen.timer);
+ if (atomic_read(&vcpu->arch.xen.timer_pending))
+ return HRTIMER_NORESTART;
+
+ atomic_inc(&vcpu->arch.xen.timer_pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+{
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ vcpu->arch.xen.timer_expires = guest_abs;
+
+ if (delta_ns <= 0) {
+ xen_timer_callback(&vcpu->arch.xen.timer);
+ } else {
+ ktime_t ktime_now = ktime_get();
+ hrtimer_start(&vcpu->arch.xen.timer,
+ ktime_add_ns(ktime_now, delta_ns),
+ HRTIMER_MODE_ABS_HARD);
+ }
+}
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.xen.timer.function = xen_timer_callback;
+}
+
static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
@@ -94,49 +201,76 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- uint64_t state_entry_time;
- unsigned int offset;
+ struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
+ uint64_t *user_times;
+ unsigned long flags;
+ size_t user_len;
+ int *user_state;
kvm_xen_update_runstate(v, state);
- if (!vx->runstate_set)
+ if (!vx->runstate_cache.active)
return;
- BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_len = sizeof(struct vcpu_runstate_info);
+ else
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ user_len)) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (state == RUNSTATE_runnable)
+ return;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
- offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
/*
- * The only difference is alignment of uint64_t in 32-bit.
- * So the first field 'state' is accessed directly using
- * offsetof() (where its offset happens to be zero), while the
- * remaining fields which are all uint64_t, start at 'offset'
- * which we tweak here by adding 4.
+ * The only difference between 32-bit and 64-bit versions of the
+ * runstate struct us the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+ * padding after the first field 'state'.
+ *
+ * So we use 'int __user *user_state' to point to the state field,
+ * and 'uint64_t __user *user_times' for runstate_entry_time. So
+ * the actual array of time[] in each state starts at user_times[1].
*/
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
-
- if (v->kvm->arch.xen.long_mode)
- offset = offsetof(struct vcpu_runstate_info, state_entry_time);
#endif
+
+ user_state = gpc->khva;
+
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ else
+ user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+
/*
* First write the updated state_entry_time at the appropriate
* location determined by 'offset'.
*/
- state_entry_time = vx->runstate_entry_time;
- state_entry_time |= XEN_RUNSTATE_UPDATE;
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(user_times[0]));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(user_times[0]));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
- sizeof(state_entry_time));
- BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
- sizeof(state_entry_time));
-
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
smp_wmb();
/*
@@ -145,16 +279,12 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->current_runstate,
- offsetof(struct vcpu_runstate_info, state),
- sizeof(vx->current_runstate)))
- return;
+ *user_state = vx->current_runstate;
/*
* Write the actual runstate times immediately after the
@@ -164,66 +294,157 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
- sizeof(((struct compat_vcpu_runstate_info *)0)->time));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof_field(struct compat_vcpu_runstate_info, time));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->runstate_times[0],
- offset + sizeof(u64),
- sizeof(vx->runstate_times)))
- return;
-
+ memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
+ user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+ smp_wmb();
+
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
+
+static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+ struct kvm_lapic_irq irq = { };
+ int r;
+
+ irq.dest_id = v->vcpu_id;
+ irq.vector = v->arch.xen.upcall_vector;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.level = 1;
+
+ /* The fast version will always work for physical unicast */
+ WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
+{
+ unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
- state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
+ if (!evtchn_pending_sel)
return;
+
+ /*
+ * Yes, this is an open-coded loop. But that's just what put_user()
+ * does anyway. Page it in and retry the instruction. We're just a
+ * little more honest about it.
+ */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ /* Now gpc->khva is a valid kernel address for the vcpu_info */
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orq %0, %1\n"
+ "notq %0\n"
+ LOCK_PREFIX "andq %0, %2\n"
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ } else {
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+ struct compat_vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orl %0, %1\n"
+ "notl %0\n"
+ LOCK_PREFIX "andl %0, %2\n"
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ }
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (v->arch.xen.upcall_vector)
+ kvm_xen_inject_vcpu_vector(v);
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
u8 rc = 0;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
*/
- struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
- struct kvm_memslots *slots = kvm_memslots(v->kvm);
- unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
/* No need for compat handling here */
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
- sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+ sizeof_field(struct vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
- sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+ sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- /*
- * For efficiency, this mirrors the checks for using the valid
- * cache in kvm_read_guest_offset_cached(), but just uses
- * __get_user() instead. And falls back to the slow path.
- */
- if (likely(slots->generation == ghc->generation &&
- !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
- /* Fast path */
- __get_user(rc, (u8 __user *)ghc->hva + offset);
- } else {
- /* Slow path */
- kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
- sizeof(rc));
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ /*
+ * If this failed, userspace has screwed up the
+ * vcpu_info mapping. No interrupts for you.
+ */
+ return 0;
+ }
+ read_lock_irqsave(&gpc->lock, flags);
}
+ rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+ read_unlock_irqrestore(&gpc->lock, flags);
return rc;
}
@@ -231,42 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (data->u.shared_info.gfn == GPA_INVALID) {
- kvm->arch.xen.shinfo_set = false;
- r = 0;
- break;
- }
+ mutex_lock(&kvm->lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ mutex_unlock(&kvm->lock);
break;
-
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.upcall_vector = data->u.vector;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
+ case KVM_XEN_ATTR_TYPE_EVTCHN:
+ r = kvm_xen_setattr_evtchn(kvm, data);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ mutex_lock(&kvm->lock);
+ kvm->arch.xen.xen_version = data->u.xen_version;
+ mutex_unlock(&kvm->lock);
+ r = 0;
+ break;
+
default:
break;
}
- mutex_unlock(&kvm->lock);
return r;
}
@@ -283,7 +513,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_set)
+ if (kvm->arch.xen.shinfo_cache.active)
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = GPA_INVALID;
@@ -295,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ data->u.xen_version = kvm->arch.xen.xen_version;
+ r = 0;
+ break;
+
default:
break;
}
@@ -319,36 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_info));
- if (!r) {
- vcpu->arch.xen.vcpu_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
+
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_time_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
- if (!r) {
- vcpu->arch.xen.vcpu_time_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
@@ -357,18 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.runstate_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
r = 0;
break;
}
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_runstate_info));
- if (!r) {
- vcpu->arch.xen.runstate_set = true;
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -466,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ if (data->u.timer.port) {
+ if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+ r = -EINVAL;
+ break;
+ }
+ vcpu->arch.xen.timer_virq = data->u.timer.port;
+ kvm_xen_init_timer(vcpu);
+
+ /* Restart the timer if it's set */
+ if (data->u.timer.expires_ns)
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+ data->u.timer.expires_ns -
+ get_kvmclock_ns(vcpu->kvm));
+ } else if (kvm_xen_timer_enabled(vcpu)) {
+ kvm_xen_stop_timer(vcpu);
+ vcpu->arch.xen.timer_virq = 0;
+ }
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
default:
break;
}
@@ -483,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_set)
+ if (vcpu->arch.xen.vcpu_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -491,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
- if (vcpu->arch.xen.vcpu_time_info_set)
+ if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -503,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EOPNOTSUPP;
break;
}
- if (vcpu->arch.xen.runstate_set) {
+ if (vcpu->arch.xen.runstate_cache.active) {
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
r = 0;
}
@@ -541,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EINVAL;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ data->u.timer.port = vcpu->arch.xen.timer_virq;
+ data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+ data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = vcpu->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
default:
break;
}
@@ -576,7 +864,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -621,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
- if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ /* Only some feature flags need to be *enabled* by userspace */
+ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+ if (xhc->flags & ~permitted_flags)
return -EINVAL;
/*
@@ -646,12 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
return 0;
}
-void kvm_xen_destroy_vm(struct kvm *kvm)
-{
- if (kvm->arch.xen_hvm_config.msr)
- static_branch_slow_dec_deferred(&kvm_xen_enabled);
-}
-
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
kvm_rax_write(vcpu, result);
@@ -668,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ unsigned long *pending_bits;
+ unsigned long flags;
+ bool ret = true;
+ int idx, i;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ ret = false;
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ }
+
+ for (i = 0; i < nr_ports; i++) {
+ if (test_bit(ports[i], pending_bits)) {
+ ret = true;
+ break;
+ }
+ }
+
+ out_rcu:
+ srcu_read_unlock(&kvm->srcu, idx);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+ u64 param, u64 *r)
+{
+ int idx, i;
+ struct sched_poll sched_poll;
+ evtchn_port_t port, *ports;
+ gpa_t gpa;
+
+ if (!longmode || !lapic_in_kernel(vcpu) ||
+ !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ return false;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
+ sizeof(sched_poll))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ if (unlikely(sched_poll.nr_ports > 1)) {
+ /* Xen (unofficially) limits number of pollers to 128 */
+ if (sched_poll.nr_ports > 128) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ ports = kmalloc_array(sched_poll.nr_ports,
+ sizeof(*ports), GFP_KERNEL);
+ if (!ports) {
+ *r = -ENOMEM;
+ return true;
+ }
+ } else
+ ports = &port;
+
+ for (i = 0; i < sched_poll.nr_ports; i++) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu,
+ (gva_t)(sched_poll.ports + i),
+ NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
+ &ports[i], sizeof(port))) {
+ *r = -EFAULT;
+ goto out;
+ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+ vcpu->arch.xen.poll_evtchn = port;
+ else
+ vcpu->arch.xen.poll_evtchn = -1;
+
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ if (sched_poll.timeout)
+ mod_timer(&vcpu->arch.xen.poll_timer,
+ jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+ kvm_vcpu_halt(vcpu);
+
+ if (sched_poll.timeout)
+ del_timer(&vcpu->arch.xen.poll_timer);
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+ }
+
+ vcpu->arch.xen.poll_evtchn = 0;
+ *r = 0;
+out:
+ /* Really, this is only needed in case of timeout */
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (unlikely(sched_poll.nr_ports > 1))
+ kfree(ports);
+ return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+ struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+ int cmd, u64 param, u64 *r)
+{
+ switch (cmd) {
+ case SCHEDOP_poll:
+ if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+ return true;
+ fallthrough;
+ case SCHEDOP_yield:
+ kvm_vcpu_on_spin(vcpu, true);
+ *r = 0;
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+ uint64_t timeout_abs_ns;
+ uint32_t flags;
+} __attribute__((packed));
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+ int vcpu_id, u64 param, u64 *r)
+{
+ struct vcpu_set_singleshot_timer oneshot;
+ s64 delta;
+ gpa_t gpa;
+ int idx;
+
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ switch (cmd) {
+ case VCPUOP_set_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ /*
+ * The only difference for 32-bit compat is the 4 bytes of
+ * padding after the interesting part of the structure. So
+ * for a faithful emulation of Xen we have to *try* to copy
+ * the padding and return -EFAULT if we can't. Otherwise we
+ * might as well just have copied the 12-byte 32-bit struct.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+ offsetof(struct vcpu_set_singleshot_timer, flags));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+ if (!gpa ||
+ kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
+ if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
+ *r = -ETIME;
+ return true;
+ }
+
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ *r = 0;
+ return true;
+
+ case VCPUOP_stop_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ kvm_xen_stop_timer(vcpu);
+ *r = 0;
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+ u64 *r)
+{
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ if (timeout) {
+ uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+ int64_t delta = timeout - guest_now;
+
+ /* Xen has a 'Linux workaround' in do_set_timer_op() which
+ * checks for negative absolute timeout values (caused by
+ * integer overflow), and for values about 13 days in the
+ * future (2^50ns) which would be caused by jiffies
+ * overflow. For those cases, it sets the timeout 100ms in
+ * the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep
+ * churning CPU time by waking it up).
+ */
+ if (unlikely((int64_t)timeout < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ timeout = guest_now + delta;
+ }
+
+ kvm_xen_start_timer(vcpu, timeout, delta);
+ } else {
+ kvm_xen_stop_timer(vcpu);
+ }
+
+ *r = 0;
+ return true;
+}
+
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
bool longmode;
- u64 input, params[6];
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -680,7 +1224,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (!longmode) {
params[0] = (u32)kvm_rbx_read(vcpu);
params[1] = (u32)kvm_rcx_read(vcpu);
@@ -702,10 +1246,44 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+ r = vcpu->kvm->arch.xen.xen_version;
+ handled = true;
+ }
+ break;
+ case __HYPERVISOR_event_channel_op:
+ if (params[0] == EVTCHNOP_send)
+ handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+ break;
+ case __HYPERVISOR_sched_op:
+ handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+ params[1], &r);
+ break;
+ case __HYPERVISOR_vcpu_op:
+ handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+ params[2], &r);
+ break;
+ case __HYPERVISOR_set_timer_op: {
+ u64 timeout = params[0];
+ /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+ if (!longmode)
+ timeout |= params[1] << 32;
+ handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
@@ -719,3 +1297,561 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+ if ((poll_evtchn == port || poll_evtchn == -1) &&
+ test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+/*
+ * The return value from this function is propagated to kvm_set_irq() API,
+ * so it returns:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK'.
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct kvm_vcpu *vcpu;
+ unsigned long *pending_bits, *mask_bits;
+ unsigned long flags;
+ int port_word_bit;
+ bool kick_vcpu = false;
+ int vcpu_idx, idx, rc;
+
+ vcpu_idx = READ_ONCE(xe->vcpu_idx);
+ if (vcpu_idx >= 0)
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ else {
+ vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+ if (!vcpu)
+ return -EINVAL;
+ WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+ }
+
+ if (!vcpu->arch.xen.vcpu_info_cache.active)
+ return -EINVAL;
+
+ if (xe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ rc = -EWOULDBLOCK;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 64;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 32;
+ }
+
+ /*
+ * If this port wasn't already set, and if it isn't masked, then
+ * we try to set the corresponding bit in the in-kernel shadow of
+ * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+ * already set, then we kick the vCPU in question to write to the
+ * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+ */
+ if (test_and_set_bit(xe->port, pending_bits)) {
+ rc = 0; /* It was already raised */
+ } else if (test_bit(xe->port, mask_bits)) {
+ rc = -ENOTCONN; /* Masked */
+ kvm_xen_check_poller(vcpu, xe->port);
+ } else {
+ rc = 1; /* Delivered to the bitmap in shared_info. */
+ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+ read_unlock_irqrestore(&gpc->lock, flags);
+ gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel
+ * and prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_rcu;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
+ }
+
+ out_rcu:
+ read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return rc;
+}
+
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ bool mm_borrowed = false;
+ int rc;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ return rc;
+
+ if (current->mm != kvm->mm) {
+ /*
+ * If not on a thread which already belongs to this KVM,
+ * we'd better be in the irqfd workqueue.
+ */
+ if (WARN_ON_ONCE(current->mm))
+ return -EINVAL;
+
+ kthread_use_mm(kvm->mm);
+ mm_borrowed = true;
+ }
+
+ /*
+ * For the irqfd workqueue, using the main kvm->lock mutex is
+ * fine since this function is invoked from kvm_set_irq() with
+ * no other lock held, no srcu. In future if it will be called
+ * directly from a vCPU thread (e.g. on hypercall for an IPI)
+ * then it may need to switch to using a leaf-node mutex for
+ * serializing the shared_info mapping.
+ */
+ mutex_lock(&kvm->lock);
+
+ /*
+ * It is theoretically possible for the page to be unmapped
+ * and the MMU notifier to invalidate the shared_info before
+ * we even get to use it. In that case, this looks like an
+ * infinite loop. It was tempting to do it via the userspace
+ * HVA instead... but that just *hides* the fact that it's
+ * an infinite loop, because if a fault occurs and it waits
+ * for the page to come back, it can *still* immediately
+ * fault and have to wait again, repeatedly.
+ *
+ * Conversely, the page could also have been reinstated by
+ * another thread before we even obtain the mutex above, so
+ * check again *first* before remapping it.
+ */
+ do {
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ int idx;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ break;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
+ srcu_read_unlock(&kvm->srcu, idx);
+ } while(!rc);
+
+ mutex_unlock(&kvm->lock);
+
+ if (mm_borrowed)
+ kthread_unuse_mm(kvm->mm);
+
+ return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ if (!level)
+ return -EINVAL;
+
+ return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used for e.g. PIRQ from passed through physical devices.
+ */
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+
+{
+ struct kvm_vcpu *vcpu;
+
+ if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ /*
+ * Xen gives us interesting mappings from vCPU index to APIC ID,
+ * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+ * to find it. Do that once at setup time, instead of every time.
+ * But beware that on live update / live migration, the routing
+ * table might be reinstated before the vCPU threads have finished
+ * recreating their vCPUs.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+ if (vcpu)
+ e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ else
+ e->xen_evtchn.vcpu_idx = -1;
+
+ e->xen_evtchn.port = ue->u.xen_evtchn.port;
+ e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+ e->set = evtchn_set_fn;
+
+ return 0;
+}
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+ struct kvm_xen_evtchn e;
+ int ret;
+
+ if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e.port = uxe->port;
+ e.vcpu_id = uxe->vcpu;
+ e.vcpu_idx = -1;
+ e.priority = uxe->priority;
+
+ ret = kvm_xen_set_evtchn(&e, kvm);
+
+ /*
+ * None of that 'return 1 if it actually got delivered' nonsense.
+ * We don't care if it was masked (-ENOTCONN) either.
+ */
+ if (ret > 0 || ret == -ENOTCONN)
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+ u32 send_port;
+ u32 type;
+ union {
+ struct kvm_xen_evtchn port;
+ struct {
+ u32 port; /* zero */
+ struct eventfd_ctx *ctx;
+ } eventfd;
+ } deliver;
+};
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct evtchnfd *evtchnfd;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ /* For an UPDATE, nothing may change except the priority/vcpu */
+ if (evtchnfd->type != data->u.evtchn.type)
+ return -EINVAL;
+
+ /*
+ * Port cannot change, and if it's zero that was an eventfd
+ * which can't be changed either.
+ */
+ if (!evtchnfd->deliver.port.port ||
+ evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ }
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct eventfd_ctx *eventfd = NULL;
+ struct evtchnfd *evtchnfd = NULL;
+ int ret = -EINVAL;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+ if (!evtchnfd)
+ return -ENOMEM;
+
+ switch(data->u.evtchn.type) {
+ case EVTCHNSTAT_ipi:
+ /* IPI must map back to the same port# */
+ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+ goto out; /* -EINVAL */
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (data->u.evtchn.deliver.port.port) {
+ if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+ goto out; /* -EINVAL */
+ } else {
+ eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto out;
+ }
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ case EVTCHNSTAT_closed:
+ case EVTCHNSTAT_unbound:
+ case EVTCHNSTAT_pirq:
+ default: /* Unknown event channel type */
+ goto out; /* -EINVAL */
+ }
+
+ evtchnfd->send_port = data->u.evtchn.send_port;
+ evtchnfd->type = data->u.evtchn.type;
+ if (eventfd) {
+ evtchnfd->deliver.eventfd.ctx = eventfd;
+ } else {
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out; /* -EINVAL; */
+
+ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ }
+
+ mutex_lock(&kvm->lock);
+ ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+ GFP_KERNEL);
+ mutex_unlock(&kvm->lock);
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+out:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+ kfree(evtchnfd);
+ return ret;
+}
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+ struct evtchnfd *evtchnfd;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ if (kvm)
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ mutex_lock(&kvm->lock);
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+ return kvm_xen_eventfd_reset(kvm);
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+ return kvm_xen_eventfd_deassign(kvm, port);
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+ return kvm_xen_eventfd_update(kvm, data);
+ if (data->u.evtchn.flags)
+ return -EINVAL;
+
+ return kvm_xen_eventfd_assign(kvm, data);
+}
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+ struct evtchnfd *evtchnfd;
+ struct evtchn_send send;
+ gpa_t gpa;
+ int idx;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+ evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ if (!evtchnfd)
+ return false;
+
+ if (evtchnfd->deliver.port.port) {
+ int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+ if (ret < 0 && ret != -ENOTCONN)
+ return false;
+ } else {
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ }
+
+ *r = 0;
+ return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
+ del_timer_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ idr_init(&kvm->arch.xen.evtchn_ports);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 463a7844a8ca..532a535a9e99 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -15,13 +15,23 @@
extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
+ struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
@@ -39,21 +49,55 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
if (static_branch_unlikely(&kvm_xen_enabled.key) &&
- vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->kvm->arch.xen.upcall_vector)
return __kvm_xen_has_interrupt(vcpu);
return 0;
}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+ return atomic_read(&vcpu->arch.xen.timer_pending);
+
+ return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
#else
static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
return 1;
}
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{
}
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
@@ -68,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
return 0;
}
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
@@ -92,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
* behalf of the vCPU. Only if the VMM does actually block
* does it need to enter RUNSTATE_blocked.
*/
- if (vcpu->preempted)
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ if (WARN_ON_ONCE(!vcpu->preempted))
+ return;
+
+ kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */
@@ -129,6 +198,9 @@ struct compat_shared_info {
struct compat_arch_shared_info arch;
};
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \
+ sizeof_field(struct compat_shared_info, \
+ evtchn_pending))
struct compat_vcpu_runstate_info {
int state;
uint64_t state_entry_time;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index bad4dee4f0e4..f76747862bd2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o misc.o cmdline.o cpu.o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-y += pc-conf-reg.o
lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
@@ -62,7 +63,6 @@ ifeq ($(CONFIG_X86_32),y)
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
- lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 3b6544111ac9..e768815e58ae 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -6,84 +6,86 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
/* if you want SMP support, implement these with real spinlocks */
-.macro LOCK reg
+.macro IRQ_SAVE reg
pushfl
cli
.endm
-.macro UNLOCK reg
+.macro IRQ_RESTORE reg
popfl
.endm
-#define BEGIN(op) \
+#define BEGIN_IRQ_SAVE(op) \
.macro endp; \
SYM_FUNC_END(atomic64_##op##_386); \
.purgem endp; \
.endm; \
SYM_FUNC_START(atomic64_##op##_386); \
- LOCK v;
+ IRQ_SAVE v;
#define ENDP endp
-#define RET \
- UNLOCK v; \
- ret
-
-#define RET_ENDP \
- RET; \
- ENDP
+#define RET_IRQ_RESTORE \
+ IRQ_RESTORE v; \
+ RET
#define v %ecx
-BEGIN(read)
+BEGIN_IRQ_SAVE(read)
movl (v), %eax
movl 4(v), %edx
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(set)
+BEGIN_IRQ_SAVE(set)
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(xchg)
+BEGIN_IRQ_SAVE(xchg)
movl (v), %eax
movl 4(v), %edx
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add)
+BEGIN_IRQ_SAVE(add)
addl %eax, (v)
adcl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add_return)
+BEGIN_IRQ_SAVE(add_return)
addl (v), %eax
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub)
+BEGIN_IRQ_SAVE(sub)
subl %eax, (v)
sbbl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub_return)
+BEGIN_IRQ_SAVE(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
@@ -91,47 +93,52 @@ BEGIN(sub_return)
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc)
+BEGIN_IRQ_SAVE(inc)
addl $1, (v)
adcl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc_return)
+BEGIN_IRQ_SAVE(inc_return)
movl (v), %eax
movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec)
+BEGIN_IRQ_SAVE(dec)
subl $1, (v)
sbbl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec_return)
+BEGIN_IRQ_SAVE(dec_return)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(add_unless)
+BEGIN_IRQ_SAVE(add_unless)
addl %eax, %ecx
adcl %edx, %edi
addl (v), %eax
@@ -143,7 +150,7 @@ BEGIN(add_unless)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
cmpl %edx, %edi
jne 1b
@@ -153,7 +160,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(inc_not_zero)
+BEGIN_IRQ_SAVE(inc_not_zero)
movl (v), %eax
movl 4(v), %edx
testl %eax, %eax
@@ -165,7 +172,7 @@ BEGIN(inc_not_zero)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
testl %edx, %edx
jne 1b
@@ -174,7 +181,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(dec_if_positive)
+BEGIN_IRQ_SAVE(dec_if_positive)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
@@ -183,5 +190,6 @@ BEGIN(dec_if_positive)
movl %eax, (v)
movl %edx, 4(v)
1:
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 1c5c81c16b06..90afb488b396 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.macro read64 reg
movl %ebx, %eax
@@ -18,7 +18,7 @@
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
- ret
+ RET
SYM_FUNC_END(atomic64_read_cx8)
SYM_FUNC_START(atomic64_set_cx8)
@@ -28,7 +28,7 @@ SYM_FUNC_START(atomic64_set_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_set_cx8)
SYM_FUNC_START(atomic64_xchg_cx8)
@@ -37,7 +37,7 @@ SYM_FUNC_START(atomic64_xchg_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_xchg_cx8)
.macro addsub_return func ins insc
@@ -68,7 +68,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
popl %esi
popl %ebx
popl %ebp
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -93,7 +93,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -118,7 +118,7 @@ SYM_FUNC_START(atomic64_dec_if_positive_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_dec_if_positive_cx8)
SYM_FUNC_START(atomic64_add_unless_cx8)
@@ -149,7 +149,7 @@ SYM_FUNC_START(atomic64_add_unless_cx8)
addl $8, %esp
popl %ebx
popl %ebp
- ret
+ RET
4:
cmpl %edx, 4(%esp)
jne 2b
@@ -176,5 +176,5 @@ SYM_FUNC_START(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4304320e51f4..23318c338db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -127,7 +127,7 @@ SYM_FUNC_START(csum_partial)
8:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#else
@@ -245,7 +245,7 @@ SYM_FUNC_START(csum_partial)
90:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#endif
@@ -260,9 +260,9 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
* Copy from ds while checksumming, otherwise like csum_partial
*/
-#define EXC(y...) \
- 9999: y; \
- _ASM_EXTABLE_UA(9999b, 6001f)
+#define EXC(y...) \
+ 9999: y; \
+ _ASM_EXTABLE_TYPE(9999b, 7f, EX_TYPE_UACCESS | EX_FLAG_CLEAR_AX)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
@@ -358,20 +358,11 @@ EXC( movb %cl, (%edi) )
adcl $0, %eax
7:
-# Exception handler:
-.section .fixup, "ax"
-
-6001:
- xorl %eax, %eax
- jmp 7b
-
-.previous
-
popl %ebx
popl %esi
popl %edi
popl %ecx # equivalent to addl $4,%esp
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#else
@@ -439,15 +430,11 @@ EXC( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
-.section .fixup, "ax"
-6001: xorl %eax, %eax
- jmp 7b
-.previous
popl %esi
popl %edi
popl %ebx
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..fe59b8ac4fcc 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -17,7 +17,7 @@ SYM_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
- ret
+ RET
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
@@ -39,7 +39,7 @@ SYM_FUNC_START(clear_page_orig)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
- ret
+ RET
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
@@ -47,6 +47,6 @@ SYM_FUNC_START(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
- ret
+ RET
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 3542502faa3b..33c70c0160ea 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -37,11 +37,11 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
popfq
mov $1, %al
- ret
+ RET
.Lnot_same:
popfq
xor %al,%al
- ret
+ RET
SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index ca01ed6029f4..6a912d58fecc 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl %ecx, 4(%esi)
popfl
- ret
+ RET
.Lnot_same:
movl (%esi), %eax
@@ -40,7 +40,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl 4(%esi), %edx
popfl
- ret
+ RET
SYM_FUNC_END(cmpxchg8b_emu)
EXPORT_SYMBOL(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index e5f77e293034..c859a8a09860 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -77,10 +77,8 @@ SYM_FUNC_START(copy_mc_fragile)
.L_done_memcpy_trap:
xorl %eax, %eax
.L_done:
- ret
-SYM_FUNC_END(copy_mc_fragile)
+ RET
- .section .fixup, "ax"
/*
* Return number of bytes not copied for any failure. Note that
* there is no "tail" handling since the source buffer is 8-byte
@@ -105,14 +103,14 @@ SYM_FUNC_END(copy_mc_fragile)
movl %ecx, %edx
jmp copy_mc_fragile_handle_tail
- .previous
-
- _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
- _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
- _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+ _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+
+SYM_FUNC_END(copy_mc_fragile)
#endif /* CONFIG_X86_MCE */
/*
@@ -132,10 +130,8 @@ SYM_FUNC_START(copy_mc_enhanced_fast_string)
rep movsb
/* Copy successful. Return zero */
xorl %eax, %eax
- ret
-SYM_FUNC_END(copy_mc_enhanced_fast_string)
+ RET
- .section .fixup, "ax"
.E_copy:
/*
* On fault %rcx is updated such that the copy instruction could
@@ -145,9 +141,9 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string)
* user-copy routines.
*/
movq %rcx, %rax
- ret
+ RET
- .previous
+ _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE)
- _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
#endif /* !CONFIG_UML */
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 2402d4c489d2..30ea644bf446 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
@@ -17,7 +17,7 @@ SYM_FUNC_START(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
- ret
+ RET
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)
@@ -85,5 +85,5 @@ SYM_FUNC_START_LOCAL(copy_page_regs)
movq (%rsp), %rbx
movq 1*8(%rsp), %r12
addq $2*8, %rsp
- ret
+ RET
SYM_FUNC_END(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 77b9b2a3b5c8..9dec1b38a98f 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -11,7 +11,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
@@ -32,14 +32,10 @@
decl %ecx
jnz 100b
102:
- .section .fixup,"ax"
-103: addl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
- .previous
- _ASM_EXTABLE_CPY(100b, 103b)
- _ASM_EXTABLE_CPY(101b, 103b)
- .endm
+ _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
+ _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
+.endm
/*
* copy_user_generic_unrolled - memory copy with exception handling.
@@ -57,12 +53,12 @@
SYM_FUNC_START(copy_user_generic_unrolled)
ASM_STAC
cmpl $8,%edx
- jb 20f /* less then 8 bytes, go to byte copy loop */
+ jb .Lcopy_user_short_string_bytes
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz .L_copy_short_string
+ jz copy_user_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -83,39 +79,11 @@ SYM_FUNC_START(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-.L_copy_short_string:
- movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz 20f
-18: movq (%rsi),%r8
-19: movq %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-20: andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xor %eax,%eax
- ASM_CLAC
- ret
+ jmp copy_user_short_string
- .section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
- jmp 60f
-40: leal (%rdx,%rcx,8),%edx
- jmp 60f
-50: movl %ecx,%edx
-60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
- .previous
+ jmp .Lcopy_user_handle_tail
_ASM_EXTABLE_CPY(1b, 30b)
_ASM_EXTABLE_CPY(2b, 30b)
@@ -133,10 +101,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
_ASM_EXTABLE_CPY(14b, 30b)
_ASM_EXTABLE_CPY(15b, 30b)
_ASM_EXTABLE_CPY(16b, 30b)
- _ASM_EXTABLE_CPY(18b, 40b)
- _ASM_EXTABLE_CPY(19b, 40b)
- _ASM_EXTABLE_CPY(21b, 50b)
- _ASM_EXTABLE_CPY(22b, 50b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
@@ -166,20 +130,16 @@ SYM_FUNC_START(copy_user_generic_string)
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
-1: rep
- movsq
+1: rep movsq
2: movl %edx,%ecx
-3: rep
- movsb
+3: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
11: leal (%rdx,%rcx,8),%ecx
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 11b)
_ASM_EXTABLE_CPY(3b, 12b)
@@ -200,19 +160,16 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
SYM_FUNC_START(copy_user_enhanced_fast_string)
ASM_STAC
- cmpl $64,%edx
- jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
+ /* CPUs without FSRM should avoid rep movsb for short copies */
+ ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
movl %edx,%ecx
-1: rep
- movsb
+1: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 12b)
SYM_FUNC_END(copy_user_enhanced_fast_string)
@@ -225,6 +182,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* Don't try to copy the tail if machine check happened
*
* Input:
+ * eax trap number written by ex_handler_copy()
* rdi destination
* rsi source
* rdx count
@@ -233,29 +191,76 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* eax uncopied bytes or 0 if successful.
*/
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
- movl %edx,%ecx
- cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */
+ cmp $X86_TRAP_MC,%eax
je 3f
+
+ movl %edx,%ecx
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
- ret
-
- /*
- * Return zero to pretend that this copy succeeded. This
- * is counter-intuitive, but needed to prevent the code
- * in lib/iov_iter.c from retrying and running back into
- * the poison cache line again. The machine check handler
- * will ensure that a SIGBUS is sent to the task.
- */
-3: xorl %eax,%eax
+ RET
+
+3:
+ movl %edx,%eax
ASM_CLAC
- ret
+ RET
_ASM_EXTABLE_CPY(1b, 2b)
+
+.Lcopy_user_handle_align:
+ addl %ecx,%edx /* ecx is zerorest also */
+ jmp .Lcopy_user_handle_tail
+
SYM_CODE_END(.Lcopy_user_handle_tail)
/*
+ * Finish memcpy of less than 64 bytes. #AC should already be set.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count (< 64)
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(copy_user_short_string)
+ movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz .Lcopy_user_short_string_bytes
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+.Lcopy_user_short_string_bytes:
+ andl %edx,%edx
+ jz 23f
+ movl %edx,%ecx
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 21b
+23: xor %eax,%eax
+ ASM_CLAC
+ RET
+
+40: leal (%rdx,%rcx,8),%edx
+ jmp 60f
+50: movl %ecx,%edx /* ecx is zerorest also */
+60: jmp .Lcopy_user_handle_tail
+
+ _ASM_EXTABLE_CPY(18b, 40b)
+ _ASM_EXTABLE_CPY(19b, 40b)
+ _ASM_EXTABLE_CPY(21b, 50b)
+ _ASM_EXTABLE_CPY(22b, 50b)
+SYM_CODE_END(copy_user_short_string)
+
+/*
* copy_user_nocache - Uncached memory copy with exception handling
* This will force destination out of cache for more performance.
*
@@ -361,9 +366,8 @@ SYM_FUNC_START(__copy_user_nocache)
xorl %eax,%eax
ASM_CLAC
sfence
- ret
+ RET
- .section .fixup,"ax"
.L_fixup_4x8b_copy:
shll $6,%ecx
addl %ecx,%edx
@@ -379,7 +383,6 @@ SYM_FUNC_START(__copy_user_nocache)
.L_fixup_handle_tail:
sfence
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 1fbd8ee9642d..d9e16a2cf285 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -201,7 +201,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
movq 3*8(%rsp), %r13
movq 4*8(%rsp), %r15
addq $5*8, %rsp
- ret
+ RET
.Lshort:
movl %ecx, %r10d
jmp .L1
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b68..50734a23034c 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
static inline unsigned short from32to16(unsigned a)
{
@@ -21,120 +22,93 @@ static inline unsigned short from32to16(unsigned a)
}
/*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
*/
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
{
- unsigned odd, count;
- unsigned long result = 0;
+ u64 temp64 = (__force u64)sum;
+ unsigned odd, result;
- if (unlikely(len == 0))
- return result;
odd = 1 & (unsigned long) buff;
if (unlikely(odd)) {
- result = *buff << 8;
+ if (unlikely(len == 0))
+ return sum;
+ temp64 = ror32((__force u32)sum, 8);
+ temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
}
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *)buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- unsigned long zero;
- unsigned count64;
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- /* main loop using 64byte blocks */
- zero = 0;
- count64 = count >> 3;
- while (count64) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq 5*8(%[src]),%[res]\n\t"
- "adcq 6*8(%[src]),%[res]\n\t"
- "adcq 7*8(%[src]),%[res]\n\t"
- "adcq %[zero],%[res]"
- : [res] "=r" (result)
- : [src] "r" (buff), [zero] "r" (zero),
- "[res]" (result));
- buff += 64;
- count64--;
- }
+ while (unlikely(len >= 64)) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 64;
+ len -= 64;
+ }
+
+ if (len & 32) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 32;
+ }
+ if (len & 16) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 16;
+ }
+ if (len & 8) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 8;
+ }
+ if (len & 7) {
+ unsigned int shift = (8 - (len & 7)) * 8;
+ unsigned long trail;
- /* last up to 7 8byte blocks */
- count %= 8;
- while (count) {
- asm("addq %1,%0\n\t"
- "adcq %2,%0\n"
- : "=r" (result)
- : "m" (*(unsigned long *)buff),
- "r" (zero), "0" (result));
- --count;
- buff += 8;
- }
- result = add32_with_carry(result>>32,
- result&0xffffffff);
+ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
+ asm("addq %[trail],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [trail] "r" (trail));
}
- if (len & 1)
- result += *buff;
- result = add32_with_carry(result>>32, result & 0xffffffff);
- if (unlikely(odd)) {
+ result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+ if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
- return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- return (__force __wsum)add32_with_carry(do_csum(buff, len),
- (__force u32)sum);
+ return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
@@ -147,4 +121,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
return csum_fold(csum_partial(buff,len,0));
}
EXPORT_SYMBOL(ip_compute_csum);
-
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 189344924a2b..145f9a0bde29 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -32,7 +32,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
user_access_end();
return sum;
}
-EXPORT_SYMBOL(csum_and_copy_from_user);
/**
* csum_and_copy_to_user - Copy and checksum to user space.
@@ -57,7 +56,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
user_access_end();
return sum;
}
-EXPORT_SYMBOL(csum_and_copy_to_user);
/**
* csum_partial_copy_nocheck - Copy and checksum.
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 65d15df6212d..0e65d00e2339 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -54,8 +54,8 @@ static void delay_loop(u64 __loops)
" jnz 2b \n"
"3: dec %0 \n"
- : /* we don't need output */
- :"a" (loops)
+ : "+a" (loops)
+ :
);
}
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index be5b5fb1598b..1e3de0769b81 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
#include <linux/error-injection.h>
#include <linux/kprobes.h>
+#include <linux/objtool.h>
asmlinkage void just_return_func(void);
@@ -10,7 +12,8 @@ asm(
".type just_return_func, @function\n"
".globl just_return_func\n"
"just_return_func:\n"
- " ret\n"
+ ANNOTATE_NOENDBR
+ ASM_RET
".size just_return_func, .-just_return_func\n"
);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index fa1bc2104b32..b70d98d79a9d 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -57,7 +57,7 @@ SYM_FUNC_START(__get_user_1)
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
@@ -71,7 +71,7 @@ SYM_FUNC_START(__get_user_2)
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
@@ -85,7 +85,7 @@ SYM_FUNC_START(__get_user_4)
3: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
@@ -100,7 +100,7 @@ SYM_FUNC_START(__get_user_8)
4: movq (%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#else
LOAD_TASK_SIZE_MINUS_N(7)
cmp %_ASM_DX,%_ASM_AX
@@ -112,7 +112,7 @@ SYM_FUNC_START(__get_user_8)
5: movl 4(%_ASM_AX),%ecx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#endif
SYM_FUNC_END(__get_user_8)
EXPORT_SYMBOL(__get_user_8)
@@ -124,7 +124,7 @@ SYM_FUNC_START(__get_user_nocheck_1)
6: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_1)
EXPORT_SYMBOL(__get_user_nocheck_1)
@@ -134,7 +134,7 @@ SYM_FUNC_START(__get_user_nocheck_2)
7: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_2)
EXPORT_SYMBOL(__get_user_nocheck_2)
@@ -144,7 +144,7 @@ SYM_FUNC_START(__get_user_nocheck_4)
8: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_4)
EXPORT_SYMBOL(__get_user_nocheck_4)
@@ -159,7 +159,7 @@ SYM_FUNC_START(__get_user_nocheck_8)
#endif
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_8)
EXPORT_SYMBOL(__get_user_nocheck_8)
@@ -169,7 +169,7 @@ SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
bad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_clac)
#ifdef CONFIG_X86_32
@@ -179,7 +179,7 @@ bad_get_user_8:
xor %edx,%edx
xor %ecx,%ecx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_8_clac)
#endif
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index dbf8cc97b7f5..12c16c6aa44a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(__sw_hweight32)
imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
shrl $24, %eax # w = w_tmp >> 24
__ASM_SIZE(pop,) %__ASM_REG(dx)
- ret
+ RET
SYM_FUNC_END(__sw_hweight32)
EXPORT_SYMBOL(__sw_hweight32)
@@ -65,7 +65,7 @@ SYM_FUNC_START(__sw_hweight64)
popq %rdx
popq %rdi
- ret
+ RET
#else /* CONFIG_X86_32 */
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
pushl %ecx
@@ -77,7 +77,7 @@ SYM_FUNC_START(__sw_hweight64)
addl %ecx, %eax # result
popl %ecx
- ret
+ RET
#endif
SYM_FUNC_END(__sw_hweight64)
EXPORT_SYMBOL(__sw_hweight64)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 12539fca75c4..b0f3b2a62ae2 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -4,7 +4,7 @@
*
* Written by Masami Hiramatsu <mhiramat@redhat.com>
*/
-#include <asm/insn.h>
+#include <asm/insn.h> /* __ignore_sync_check__ */
/* Attribute tables are generated from opcode map */
#include "inat-tables.c"
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index bb0b3fe1e0a0..21104c41cba0 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -37,8 +37,6 @@ enum reg_type {
*/
static bool is_string_insn(struct insn *insn)
{
- insn_get_opcode(insn);
-
/* All string instructions have a 1-byte opcode. */
if (insn->opcode.nbytes != 1)
return false;
@@ -232,7 +230,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
* resolve_seg_reg() - obtain segment register index
* @insn: Instruction with operands
* @regs: Register values as seen when entering kernel mode
- * @regoff: Operand offset, in pt_regs, used to deterimine segment register
+ * @regoff: Operand offset, in pt_regs, used to determine segment register
*
* Determine the segment register associated with the operands and, if
* applicable, prefixes and the instruction pointed by @insn.
@@ -344,9 +342,9 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
*/
static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
{
-#ifdef CONFIG_X86_64
unsigned short sel;
+#ifdef CONFIG_X86_64
switch (seg_reg_idx) {
case INAT_SEG_REG_IGNORE:
return 0;
@@ -404,11 +402,8 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
case INAT_SEG_REG_FS:
return (unsigned short)(regs->fs & 0xffff);
case INAT_SEG_REG_GS:
- /*
- * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS.
- * The macro below takes care of both cases.
- */
- return get_user_gs(regs);
+ savesegment(gs, sel);
+ return sel;
case INAT_SEG_REG_IGNORE:
default:
return -EINVAL;
@@ -416,32 +411,44 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
#endif /* CONFIG_X86_64 */
}
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
+static const int pt_regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#else
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, gs),
+#endif
+};
+
+int pt_regs_offset(struct pt_regs *regs, int regno)
{
+ if ((unsigned)regno < ARRAY_SIZE(pt_regoff))
+ return pt_regoff[regno];
+ return -EDOM;
+}
+
+static int get_regno(struct insn *insn, enum reg_type type)
+{
+ int nr_registers = ARRAY_SIZE(pt_regoff);
int regno = 0;
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
/*
* Don't possibly decode a 32-bit instructions as
* reading a 64-bit-only register.
@@ -509,7 +516,18 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
WARN_ONCE(1, "decoded an instruction with an invalid register");
return -EINVAL;
}
- return regoff[regno];
+ return regno;
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = get_regno(insn, type);
+
+ if (regno < 0)
+ return regno;
+
+ return pt_regs_offset(regs, regno);
}
/**
@@ -517,7 +535,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
* @insn: Instruction containing ModRM byte
* @regs: Register values as seen when entering kernel mode
* @offs1: Offset of the first operand register
- * @offs2: Offset of the second opeand register, if applicable
+ * @offs2: Offset of the second operand register, if applicable
*
* Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
* in @insn. This function is to be used with 16-bit address encodings. The
@@ -576,7 +594,7 @@ static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
* If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
* only addressing. This means that no registers are involved in
* computing the effective address. Thus, ensure that the first
- * register offset is invalild. The second register offset is already
+ * register offset is invalid. The second register offset is already
* invalid under the aforementioned conditions.
*/
if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
@@ -855,6 +873,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
}
/**
+ * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte.
+ * The register is obtained as a pointer within pt_regs.
+ */
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs)
+{
+ int offset;
+
+ offset = insn_get_modrm_reg_off(insn, regs);
+ if (offset < 0)
+ return NULL;
+ return (void *)regs + offset;
+}
+
+/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
@@ -928,10 +966,11 @@ static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
- insn_get_modrm(insn);
+ int ret;
- if (!insn->modrm.nbytes)
- return -EINVAL;
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (X86_MODRM_MOD(insn->modrm.value) != 3)
return -EINVAL;
@@ -977,14 +1016,14 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
long tmp;
+ int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
- insn_get_modrm(insn);
-
- if (!insn->modrm.nbytes)
- return -EINVAL;
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
@@ -1106,18 +1145,21 @@ static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
* @base_offset will have a register, as an offset from the base of pt_regs,
* that can be used to resolve the associated segment.
*
- * -EINVAL on error.
+ * Negative value on error.
*/
static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
int *base_offset, long *eff_addr)
{
long base, indx;
int indx_offset;
+ int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
- insn_get_modrm(insn);
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (!insn->modrm.nbytes)
return -EINVAL;
@@ -1125,7 +1167,9 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
- insn_get_sib(insn);
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
if (!insn->sib.nbytes)
return -EINVAL;
@@ -1194,8 +1238,8 @@ static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
short eff_addr;
long tmp;
- insn_get_modrm(insn);
- insn_get_displacement(insn);
+ if (insn_get_displacement(insn))
+ goto out;
if (insn->addr_bytes != 2)
goto out;
@@ -1403,6 +1447,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
if (!insn || !regs)
return (void __user *)-1L;
+ if (insn_get_opcode(insn))
+ return (void __user *)-1L;
+
switch (insn->addr_bytes) {
case 2:
return get_addr_ref_16(insn, regs);
@@ -1415,7 +1462,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
-static unsigned long insn_get_effective_ip(struct pt_regs *regs)
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
@@ -1428,10 +1475,12 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs)
if (!user_64bit_mode(regs)) {
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
- return 0;
+ return -EINVAL;
}
- return seg_base + regs->ip;
+ *ip = seg_base + regs->ip;
+
+ return 0;
}
/**
@@ -1444,18 +1493,17 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs)
*
* Returns:
*
- * Number of instruction bytes copied.
- *
- * 0 if nothing was copied.
+ * - number of instruction bytes copied.
+ * - 0 if nothing was copied.
+ * - -EINVAL if the linear address of the instruction could not be calculated
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
- ip = insn_get_effective_ip(regs);
- if (!ip)
- return 0;
+ if (insn_get_effective_ip(regs, &ip))
+ return -EINVAL;
not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
@@ -1473,18 +1521,17 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
*
* Returns:
*
- * Number of instruction bytes copied.
- *
- * 0 if nothing was copied.
+ * - number of instruction bytes copied.
+ * - 0 if nothing was copied.
+ * - -EINVAL if the linear address of the instruction could not be calculated.
*/
int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
- ip = insn_get_effective_ip(regs);
- if (!ip)
- return 0;
+ if (insn_get_effective_ip(regs, &ip))
+ return -EINVAL;
not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
@@ -1492,7 +1539,7 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN
}
/**
- * insn_decode() - Decode an instruction
+ * insn_decode_from_regs() - Decode an instruction
* @insn: Structure to store decoded instruction
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Buffer containing the instruction bytes
@@ -1505,8 +1552,8 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN
*
* True if instruction was decoded, False otherwise.
*/
-bool insn_decode(struct insn *insn, struct pt_regs *regs,
- unsigned char buf[MAX_INSN_SIZE], int buf_size)
+bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size)
{
int seg_defs;
@@ -1529,9 +1576,95 @@ bool insn_decode(struct insn *insn, struct pt_regs *regs,
insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
- insn_get_length(insn);
+ if (insn_get_length(insn))
+ return false;
+
if (buf_size < insn->length)
return false;
return true;
}
+
+/**
+ * insn_decode_mmio() - Decode a MMIO instruction
+ * @insn: Structure to store decoded instruction
+ * @bytes: Returns size of memory operand
+ *
+ * Decodes instruction that used for Memory-mapped I/O.
+ *
+ * Returns:
+ *
+ * Type of the instruction. Size of the memory operand is stored in
+ * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ */
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+{
+ enum mmio_type type = MMIO_DECODE_FAILED;
+
+ *bytes = 0;
+
+ if (insn_get_opcode(insn))
+ return MMIO_DECODE_FAILED;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x88: /* MOV m8,r8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE;
+ break;
+
+ case 0xc6: /* MOV m8, imm8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE_IMM;
+ break;
+
+ case 0x8a: /* MOV r8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_READ;
+ break;
+
+ case 0xa4: /* MOVS m8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_MOVS;
+ break;
+
+ case 0x0f: /* Two-byte instruction */
+ switch (insn->opcode.bytes[1]) {
+ case 0xb6: /* MOVZX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xb7: /* MOVZX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_ZERO_EXTEND;
+ break;
+
+ case 0xbe: /* MOVSX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xbf: /* MOVSX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_SIGN_EXTEND;
+ break;
+ }
+ break;
+ }
+
+ return type;
+}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 435630a6ec97..55e371cc69fd 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -11,10 +11,14 @@
#else
#include <string.h>
#endif
-#include <asm/inat.h>
-#include <asm/insn.h>
+#include <asm/inat.h> /*__ignore_sync_check__ */
+#include <asm/insn.h> /* __ignore_sync_check__ */
+#include <asm/unaligned.h> /* __ignore_sync_check__ */
-#include <asm/emulate_prefix.h>
+#include <linux/errno.h>
+#include <linux/kconfig.h>
+
+#include <asm/emulate_prefix.h> /* __ignore_sync_check__ */
#define leXX_to_cpu(t, r) \
({ \
@@ -34,10 +38,10 @@
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
@@ -51,6 +55,7 @@
* insn_init() - initialize struct insn
* @insn: &struct insn to be initialized
* @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @buf_len: length of the insn buffer at @kaddr
* @x86_64: !0 for 64-bit kernel or 64-bit app
*/
void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
@@ -111,8 +116,12 @@ static void insn_get_emulate_prefix(struct insn *insn)
* Populates the @insn->prefixes bitmap, and updates @insn->next_byte
* to point to the (first) opcode. No effect if @insn->prefixes.got
* is already set.
+ *
+ * * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_prefixes(struct insn *insn)
+int insn_get_prefixes(struct insn *insn)
{
struct insn_field *prefixes = &insn->prefixes;
insn_attr_t attr;
@@ -120,7 +129,7 @@ void insn_get_prefixes(struct insn *insn)
int i, nb;
if (prefixes->got)
- return;
+ return 0;
insn_get_emulate_prefix(insn);
@@ -230,8 +239,10 @@ vex_end:
prefixes->got = 1;
+ return 0;
+
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -243,16 +254,25 @@ err_out:
* If necessary, first collects any preceding (prefix) bytes.
* Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
* is already 1.
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_opcode(struct insn *insn)
+int insn_get_opcode(struct insn *insn)
{
struct insn_field *opcode = &insn->opcode;
+ int pfx_id, ret;
insn_byte_t op;
- int pfx_id;
+
if (opcode->got)
- return;
- if (!insn->prefixes.got)
- insn_get_prefixes(insn);
+ return 0;
+
+ if (!insn->prefixes.got) {
+ ret = insn_get_prefixes(insn);
+ if (ret)
+ return ret;
+ }
/* Get first opcode */
op = get_next(insn_byte_t, insn);
@@ -267,9 +287,13 @@ void insn_get_opcode(struct insn *insn)
insn->attr = inat_get_avx_attribute(op, m, p);
if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) ||
(!inat_accept_vex(insn->attr) &&
- !inat_is_group(insn->attr)))
- insn->attr = 0; /* This instruction is bad */
- goto end; /* VEX has only 1 byte for opcode */
+ !inat_is_group(insn->attr))) {
+ /* This instruction is bad */
+ insn->attr = 0;
+ return -EINVAL;
+ }
+ /* VEX has only 1 byte for opcode */
+ goto end;
}
insn->attr = inat_get_opcode_attribute(op);
@@ -280,13 +304,18 @@ void insn_get_opcode(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
}
- if (inat_must_vex(insn->attr))
- insn->attr = 0; /* This instruction is bad */
+
+ if (inat_must_vex(insn->attr)) {
+ /* This instruction is bad */
+ insn->attr = 0;
+ return -EINVAL;
+ }
end:
opcode->got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -296,15 +325,25 @@ err_out:
* Populates @insn->modrm and updates @insn->next_byte to point past the
* ModRM byte, if any. If necessary, first collects the preceding bytes
* (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_modrm(struct insn *insn)
+int insn_get_modrm(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
insn_byte_t pfx_id, mod;
+ int ret;
+
if (modrm->got)
- return;
- if (!insn->opcode.got)
- insn_get_opcode(insn);
+ return 0;
+
+ if (!insn->opcode.got) {
+ ret = insn_get_opcode(insn);
+ if (ret)
+ return ret;
+ }
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
@@ -313,17 +352,22 @@ void insn_get_modrm(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
insn->attr);
- if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
- insn->attr = 0; /* This is bad */
+ if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) {
+ /* Bad insn */
+ insn->attr = 0;
+ return -EINVAL;
+ }
}
}
if (insn->x86_64 && inat_is_force64(insn->attr))
insn->opnd_bytes = 8;
+
modrm->got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
@@ -337,11 +381,16 @@ err_out:
int insn_rip_relative(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
+ int ret;
if (!insn->x86_64)
return 0;
- if (!modrm->got)
- insn_get_modrm(insn);
+
+ if (!modrm->got) {
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return 0;
+ }
/*
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
@@ -355,15 +404,25 @@ int insn_rip_relative(struct insn *insn)
*
* If necessary, first collects the instruction up to and including the
* ModRM byte.
+ *
+ * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
*/
-void insn_get_sib(struct insn *insn)
+int insn_get_sib(struct insn *insn)
{
insn_byte_t modrm;
+ int ret;
if (insn->sib.got)
- return;
- if (!insn->modrm.got)
- insn_get_modrm(insn);
+ return 0;
+
+ if (!insn->modrm.got) {
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
+ }
+
if (insn->modrm.nbytes) {
modrm = insn->modrm.bytes[0];
if (insn->addr_bytes != 2 &&
@@ -374,8 +433,10 @@ void insn_get_sib(struct insn *insn)
}
insn->sib.got = 1;
+ return 0;
+
err_out:
- return;
+ return -ENODATA;
}
@@ -386,15 +447,25 @@ err_out:
* If necessary, first collects the instruction up to and including the
* SIB byte.
* Displacement value is sign-expanded.
+ *
+ * * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
*/
-void insn_get_displacement(struct insn *insn)
+int insn_get_displacement(struct insn *insn)
{
insn_byte_t mod, rm, base;
+ int ret;
if (insn->displacement.got)
- return;
- if (!insn->sib.got)
- insn_get_sib(insn);
+ return 0;
+
+ if (!insn->sib.got) {
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
+ }
+
if (insn->modrm.nbytes) {
/*
* Interpreting the modrm byte:
@@ -436,9 +507,10 @@ void insn_get_displacement(struct insn *insn)
}
out:
insn->displacement.got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/* Decode moffset16/32/64. Return 0 if failed */
@@ -537,20 +609,30 @@ err_out:
}
/**
- * insn_get_immediate() - Get the immediates of instruction
+ * insn_get_immediate() - Get the immediate in an instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* displacement bytes.
* Basically, most of immediates are sign-expanded. Unsigned-value can be
- * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ * computed by bit masking with ((1 << (nbytes * 8)) - 1)
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_immediate(struct insn *insn)
+int insn_get_immediate(struct insn *insn)
{
+ int ret;
+
if (insn->immediate.got)
- return;
- if (!insn->displacement.got)
- insn_get_displacement(insn);
+ return 0;
+
+ if (!insn->displacement.got) {
+ ret = insn_get_displacement(insn);
+ if (ret)
+ return ret;
+ }
if (inat_has_moffset(insn->attr)) {
if (!__get_moffset(insn))
@@ -597,9 +679,10 @@ void insn_get_immediate(struct insn *insn)
}
done:
insn->immediate.got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -608,13 +691,65 @@ err_out:
*
* If necessary, first collects the instruction up to and including the
* immediates bytes.
- */
-void insn_get_length(struct insn *insn)
+ *
+ * Returns:
+ * - 0 on success
+ * - < 0 on error
+*/
+int insn_get_length(struct insn *insn)
{
+ int ret;
+
if (insn->length)
- return;
- if (!insn->immediate.got)
- insn_get_immediate(insn);
+ return 0;
+
+ if (!insn->immediate.got) {
+ ret = insn_get_immediate(insn);
+ if (ret)
+ return ret;
+ }
+
insn->length = (unsigned char)((unsigned long)insn->next_byte
- (unsigned long)insn->kaddr);
+
+ return 0;
+}
+
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+ return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+ insn->displacement.got && insn->immediate.got;
+}
+
+/**
+ * insn_decode() - Decode an x86 instruction
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @buf_len: length of the insn buffer at @kaddr
+ * @m: insn mode, see enum insn_mode
+ *
+ * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
+ */
+int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m)
+{
+ int ret;
+
+/* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */
+
+ if (m == INSN_MODE_KERN)
+ insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64));
+ else
+ insn_init(insn, kaddr, buf_len, m == INSN_MODE_64);
+
+ ret = insn_get_length(insn);
+ if (ret)
+ return ret;
+
+ if (insn_complete(insn))
+ return 0;
+
+ return -EINVAL;
}
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index cb5a1964506b..a1f9416bf67a 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -11,5 +11,5 @@
SYM_FUNC_START(__iowrite32_copy)
movl %edx,%ecx
rep movsd
- ret
+ RET
SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index df50451d94ef..3e2f33fc33de 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -22,7 +22,7 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n)
: "memory");
}
-void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+static void string_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
if (unlikely(!n))
return;
@@ -38,9 +38,8 @@ void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
}
rep_movs(to, (const void *)from, n);
}
-EXPORT_SYMBOL(memcpy_fromio);
-void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
{
if (unlikely(!n))
return;
@@ -56,14 +55,64 @@ void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
}
rep_movs((void *)to, (const void *) from, n);
}
+
+static void unrolled_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+{
+ const volatile char __iomem *in = from;
+ char *out = to;
+ int i;
+
+ for (i = 0; i < n; ++i)
+ out[i] = readb(&in[i]);
+}
+
+static void unrolled_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+{
+ volatile char __iomem *out = to;
+ const char *in = from;
+ int i;
+
+ for (i = 0; i < n; ++i)
+ writeb(in[i], &out[i]);
+}
+
+static void unrolled_memset_io(volatile void __iomem *a, int b, size_t c)
+{
+ volatile char __iomem *mem = a;
+ int i;
+
+ for (i = 0; i < c; ++i)
+ writeb(b, &mem[i]);
+}
+
+void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+{
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO))
+ unrolled_memcpy_fromio(to, from, n);
+ else
+ string_memcpy_fromio(to, from, n);
+}
+EXPORT_SYMBOL(memcpy_fromio);
+
+void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+{
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO))
+ unrolled_memcpy_toio(to, from, n);
+ else
+ string_memcpy_toio(to, from, n);
+}
EXPORT_SYMBOL(memcpy_toio);
void memset_io(volatile void __iomem *a, int b, size_t c)
{
- /*
- * TODO: memset can mangle the IO patterns quite a bit.
- * perhaps it would be better to use a dumb one:
- */
- memset((void *)a, b, c);
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) {
+ unrolled_memset_io(a, b, c);
+ } else {
+ /*
+ * TODO: memset can mangle the IO patterns quite a bit.
+ * perhaps it would be better to use a dumb one:
+ */
+ memset((void *)a, b, c);
+ }
}
EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index a53665116458..a58f451a7dd3 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -11,7 +11,7 @@
#include <asm/msr.h>
#include <asm/archrandom.h>
#include <asm/e820/api.h>
-#include <asm/io.h>
+#include <asm/shared/io.h>
/*
* When built for the regular kernel, several functions need to be stubbed out
@@ -56,11 +56,14 @@ unsigned long kaslr_get_random_long(const char *purpose)
unsigned long raw, random = get_boot_seed();
bool use_i8254 = true;
- debug_putstr(purpose);
- debug_putstr(" KASLR using");
+ if (purpose) {
+ debug_putstr(purpose);
+ debug_putstr(" KASLR using");
+ }
if (has_cpuflag(X86_FEATURE_RDRAND)) {
- debug_putstr(" RDRAND");
+ if (purpose)
+ debug_putstr(" RDRAND");
if (rdrand_long(&raw)) {
random ^= raw;
use_i8254 = false;
@@ -68,7 +71,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
}
if (has_cpuflag(X86_FEATURE_TSC)) {
- debug_putstr(" RDTSC");
+ if (purpose)
+ debug_putstr(" RDTSC");
raw = rdtsc();
random ^= raw;
@@ -76,7 +80,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
}
if (use_i8254) {
- debug_putstr(" i8254");
+ if (purpose)
+ debug_putstr(" i8254");
random ^= i8254();
}
@@ -86,7 +91,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
: "a" (random), "rm" (mix_const));
random += raw;
- debug_putstr("...\n");
+ if (purpose)
+ debug_putstr("...\n");
return random;
}
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index e565d1c9019e..ef3af7ff2c8a 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -4,14 +4,11 @@
#undef memcpy
#undef memset
+#undef memmove
__visible void *memcpy(void *to, const void *from, size_t n)
{
-#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE)
- return __memcpy3d(to, from, n);
-#else
return __memcpy(to, from, n);
-#endif
}
EXPORT_SYMBOL(memcpy);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1e299ac73c86..d0d7b9bc6cad 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,7 +4,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
.pushsection .noinstr.text, "ax"
@@ -27,8 +27,7 @@
* Output:
* rax original destination
*/
-SYM_FUNC_START_ALIAS(__memcpy)
-SYM_FUNC_START_WEAK(memcpy)
+SYM_FUNC_START(__memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
@@ -39,12 +38,13 @@ SYM_FUNC_START_WEAK(memcpy)
rep movsq
movl %edx, %ecx
rep movsb
- ret
-SYM_FUNC_END(memcpy)
-SYM_FUNC_END_ALIAS(__memcpy)
-EXPORT_SYMBOL(memcpy)
+ RET
+SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+EXPORT_SYMBOL(memcpy)
+
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
* simpler than memcpy. Use memcpy_erms when possible.
@@ -53,7 +53,7 @@ SYM_FUNC_START_LOCAL(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
- ret
+ RET
SYM_FUNC_END(memcpy_erms)
SYM_FUNC_START_LOCAL(memcpy_orig)
@@ -137,7 +137,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_16bytes:
cmpl $8, %edx
@@ -149,7 +149,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_8bytes:
cmpl $4, %edx
@@ -162,7 +162,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_3bytes:
subl $1, %edx
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movb %cl, (%rdi)
.Lend:
- retq
+ RET
SYM_FUNC_END(memcpy_orig)
.popsection
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 41902fe8b859..d83cba364e31 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,7 +8,7 @@
*/
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#undef memmove
@@ -24,7 +24,6 @@
* Output:
* rax: dest
*/
-SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)
mov %rdi, %rax
@@ -40,7 +39,7 @@ SYM_FUNC_START(__memmove)
/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
- ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+ ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS
/*
* movsq instruction have many startup latency
@@ -205,8 +204,9 @@ SYM_FUNC_START(__memmove)
movb (%rsi), %r11b
movb %r11b, (%rdi)
13:
- retq
+ RET
SYM_FUNC_END(__memmove)
-SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
+
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 0bfd26e4ca9e..fc9ffd3ff3b2 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
@@ -17,7 +17,6 @@
*
* rax original destination
*/
-SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
@@ -40,12 +39,13 @@ SYM_FUNC_START(__memset)
movl %edx,%ecx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(__memset)
-SYM_FUNC_END_ALIAS(memset)
-EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
+SYM_FUNC_ALIAS_WEAK(memset, __memset)
+EXPORT_SYMBOL(memset)
+
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
@@ -63,7 +63,7 @@ SYM_FUNC_START_LOCAL(memset_erms)
movq %rdx,%rcx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(memset_erms)
SYM_FUNC_START_LOCAL(memset_orig)
@@ -125,7 +125,7 @@ SYM_FUNC_START_LOCAL(memset_orig)
.Lende:
movq %r10,%rax
- ret
+ RET
.Lbad_alignment:
cmpq $7,%rdx
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
deleted file mode 100644
index 419365c48b2a..000000000000
--- a/arch/x86/lib/mmx_32.c
+++ /dev/null
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MMX 3DNow! library helper functions
- *
- * To do:
- * We can use MMX just for prefetch in IRQ's. This may be a win.
- * (reported so on K6-III)
- * We should use a better code neutral filler for the short jump
- * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
- * We also want to clobber the filler register so we don't get any
- * register forwarding stalls on the filler.
- *
- * Add *user handling. Checksums are not a win with MMX on any CPU
- * tested so far for any MMX solution figured.
- *
- * 22/09/2000 - Arjan van de Ven
- * Improved for non-egineering-sample Athlons
- *
- */
-#include <linux/hardirq.h>
-#include <linux/string.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include <asm/fpu/api.h>
-#include <asm/asm.h>
-
-/*
- * Use KFPU_387. MMX instructions are not affected by MXCSR,
- * but both AMD and Intel documentation states that even integer MMX
- * operations will result in #MF if an exception is pending in FCW.
- *
- * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
- * any subsequent user of the 387 stack will reinitialize it using
- * KFPU_387.
- */
-
-void *_mmx_memcpy(void *to, const void *from, size_t len)
-{
- void *p;
- int i;
-
- if (unlikely(in_interrupt()))
- return __memcpy(to, from, len);
-
- p = to;
- i = len >> 6; /* len/64 */
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n" /* This set is 28 bytes */
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from));
-
- for ( ; i > 5; i--) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for ( ; i > 0; i--) {
- __asm__ __volatile__ (
- " movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- /*
- * Now do the tail of the block:
- */
- __memcpy(to, from, len & 63);
- kernel_fpu_end();
-
- return p;
-}
-EXPORT_SYMBOL(_mmx_memcpy);
-
-#ifdef CONFIG_MK7
-
-/*
- * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
- * other MMX using processors do not.
- */
-
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- " movntq %%mm0, (%0)\n"
- " movntq %%mm0, 8(%0)\n"
- " movntq %%mm0, 16(%0)\n"
- " movntq %%mm0, 24(%0)\n"
- " movntq %%mm0, 32(%0)\n"
- " movntq %%mm0, 40(%0)\n"
- " movntq %%mm0, 48(%0)\n"
- " movntq %%mm0, 56(%0)\n"
- : : "r" (page) : "memory");
- page += 64;
- }
-
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence\n"::);
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- /*
- * maybe the prefetch stuff can go before the expensive fnsave...
- * but that is for later. -AV
- */
- __asm__ __volatile__(
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < (4096-320)/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for (i = (4096-320)/64; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from += 64;
- to += 64;
- }
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence \n"::);
- kernel_fpu_end();
-}
-
-#else /* CONFIG_MK7 */
-
-/*
- * Generic MMX implementation without K7 specific streaming
- */
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/128; i++) {
- __asm__ __volatile__ (
- " movq %%mm0, (%0)\n"
- " movq %%mm0, 8(%0)\n"
- " movq %%mm0, 16(%0)\n"
- " movq %%mm0, 24(%0)\n"
- " movq %%mm0, 32(%0)\n"
- " movq %%mm0, 40(%0)\n"
- " movq %%mm0, 48(%0)\n"
- " movq %%mm0, 56(%0)\n"
- " movq %%mm0, 64(%0)\n"
- " movq %%mm0, 72(%0)\n"
- " movq %%mm0, 80(%0)\n"
- " movq %%mm0, 88(%0)\n"
- " movq %%mm0, 96(%0)\n"
- " movq %%mm0, 104(%0)\n"
- " movq %%mm0, 112(%0)\n"
- " movq %%mm0, 120(%0)\n"
- : : "r" (page) : "memory");
- page += 128;
- }
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- kernel_fpu_end();
-}
-
-#endif /* !CONFIG_MK7 */
-
-/*
- * Favour MMX for page clear and copy:
- */
-static void slow_zero_page(void *page)
-{
- int d0, d1;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; stosl"
-
- : "=&c" (d0), "=&D" (d1)
- :"a" (0), "1" (page), "0" (1024)
- :"memory");
-}
-
-void mmx_clear_page(void *page)
-{
- if (unlikely(in_interrupt()))
- slow_zero_page(page);
- else
- fast_clear_page(page);
-}
-EXPORT_SYMBOL(mmx_clear_page);
-
-static void slow_copy_page(void *to, void *from)
-{
- int d0, d1, d2;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; movsl"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (1024), "1" ((long) to), "2" ((long) from)
- : "memory");
-}
-
-void mmx_copy_page(void *to, void *from)
-{
- if (unlikely(in_interrupt()))
- slow_copy_page(to, from);
- else
- fast_copy_page(to, from);
-}
-EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index a2b9caa5274c..ebd259f31496 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -35,7 +35,7 @@ SYM_FUNC_START(\op\()_safe_regs)
movl %edi, 28(%r10)
popq %r12
popq %rbx
- ret
+ RET
3:
movl $-EIO, %r11d
jmp 2b
@@ -77,7 +77,7 @@ SYM_FUNC_START(\op\()_safe_regs)
popl %esi
popl %ebp
popl %ebx
- ret
+ RET
3:
movl $-EIO, 4(%esp)
jmp 2b
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 75a0915b0d01..40bbe56bde32 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -252,7 +252,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info)
rv->err = wrmsr_safe_regs(rv->regs);
}
-int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
@@ -265,7 +265,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
}
EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
-int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 3bd905e10ee2..b09cd2ad426c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(msrs_free);
* argument @m.
*
*/
-int msr_read(u32 msr, struct msr *m)
+static int msr_read(u32 msr, struct msr *m)
{
int err;
u64 val;
@@ -54,7 +54,7 @@ int msr_read(u32 msr, struct msr *m)
* @msr: MSR to write
* @m: value to write
*/
-int msr_write(u32 msr, struct msr *m)
+static int msr_write(u32 msr, struct msr *m)
{
return wrmsrl_safe(msr, m->q);
}
diff --git a/arch/x86/lib/pc-conf-reg.c b/arch/x86/lib/pc-conf-reg.c
new file mode 100644
index 000000000000..febb52749e8d
--- /dev/null
+++ b/arch/x86/lib/pc-conf-reg.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for the configuration register space at port I/O locations
+ * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec,
+ * Cyrix CPUs, numerous chipsets. As the space is indirectly addressed
+ * it may have to be protected with a spinlock, depending on the context.
+ */
+
+#include <linux/spinlock.h>
+
+#include <asm/pc-conf-reg.h>
+
+DEFINE_RAW_SPINLOCK(pc_conf_lock);
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 0ea344c5ea43..b7dfd60243b7 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -48,11 +48,12 @@ SYM_FUNC_START(__put_user_1)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
@@ -62,11 +63,12 @@ SYM_FUNC_START(__put_user_2)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
2: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
@@ -76,11 +78,12 @@ SYM_FUNC_START(__put_user_4)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
3: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
@@ -90,6 +93,7 @@ SYM_FUNC_START(__put_user_8)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
4: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index f6fb1d218dcc..b2b2366885a2 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -4,33 +4,38 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/frame.h>
-.macro THUNK reg
.section .text.__x86.indirect_thunk
- .align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
- JMP_NOSPEC \reg
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
+.macro RETPOLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
- call .Ldo_rop_\@
+ call .Ldo_rop_\@
.Lspec_trap_\@:
UNWIND_HINT_EMPTY
pause
lfence
- jmp .Lspec_trap_\@
+ jmp .Lspec_trap_\@
.Ldo_rop_\@:
- mov %\reg, (%_ASM_SP)
+ mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
- ret
-SYM_FUNC_END(__x86_retpoline_\reg)
+ RET
+.endm
+
+.macro THUNK reg
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
+
+ ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
+ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE
.endm
@@ -48,16 +53,17 @@ SYM_FUNC_END(__x86_retpoline_\reg)
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg)
-#undef GEN
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_thunk_array)
+
#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
-
#undef GEN
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_thunk_array)
+
#define GEN(reg) EXPORT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>
-
#undef GEN
-#define GEN(reg) EXPORT_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index d15fdae9656e..53b3f202267c 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -11,6 +11,7 @@
* strings.
*/
+#define __NO_FORTIFY
#include <linux/string.h>
#include <linux/export.h>
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c3e8a62ca561..ad0139d25401 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -32,7 +32,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
unsigned long ret;
- if (__range_not_ok(from, n, TASK_SIZE))
+ if (!__access_ok(from, n))
return n;
if (!nmi_uaccess_okay())
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7d290777246d..422257c350c6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -8,7 +8,6 @@
*/
#include <linux/export.h>
#include <linux/uaccess.h>
-#include <asm/mmx.h>
#include <asm/asm.h>
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -43,11 +42,7 @@ do { \
" movl %2,%0\n" \
"1: rep; stosb\n" \
"2: " ASM_CLAC "\n" \
- ".section .fixup,\"ax\"\n" \
- "3: lea 0(%2,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0) \
: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
@@ -149,10 +144,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
"36: movl %%eax, %0\n"
"37: rep; movsb\n"
"100:\n"
- ".section .fixup,\"ax\"\n"
- "101: lea 0(%%eax,%0,4),%0\n"
- " jmp 100b\n"
- ".previous\n"
_ASM_EXTABLE_UA(1b, 100b)
_ASM_EXTABLE_UA(2b, 100b)
_ASM_EXTABLE_UA(3b, 100b)
@@ -190,7 +181,7 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
_ASM_EXTABLE_UA(35b, 100b)
_ASM_EXTABLE_UA(36b, 100b)
_ASM_EXTABLE_UA(37b, 100b)
- _ASM_EXTABLE_UA(99b, 101b)
+ _ASM_EXTABLE_TYPE_REG(99b, 100b, EX_TYPE_UCOPY_LEN4, %%eax)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -255,30 +246,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
- ".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: jmp 8b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 16b)
- _ASM_EXTABLE_UA(1b, 16b)
- _ASM_EXTABLE_UA(2b, 16b)
- _ASM_EXTABLE_UA(21b, 16b)
- _ASM_EXTABLE_UA(3b, 16b)
- _ASM_EXTABLE_UA(31b, 16b)
- _ASM_EXTABLE_UA(4b, 16b)
- _ASM_EXTABLE_UA(41b, 16b)
- _ASM_EXTABLE_UA(10b, 16b)
- _ASM_EXTABLE_UA(51b, 16b)
- _ASM_EXTABLE_UA(11b, 16b)
- _ASM_EXTABLE_UA(61b, 16b)
- _ASM_EXTABLE_UA(12b, 16b)
- _ASM_EXTABLE_UA(71b, 16b)
- _ASM_EXTABLE_UA(13b, 16b)
- _ASM_EXTABLE_UA(81b, 16b)
- _ASM_EXTABLE_UA(14b, 16b)
- _ASM_EXTABLE_UA(91b, 16b)
- _ASM_EXTABLE_UA(6b, 9b)
- _ASM_EXTABLE_UA(7b, 16b)
+ _ASM_EXTABLE_UA(0b, 8b)
+ _ASM_EXTABLE_UA(1b, 8b)
+ _ASM_EXTABLE_UA(2b, 8b)
+ _ASM_EXTABLE_UA(21b, 8b)
+ _ASM_EXTABLE_UA(3b, 8b)
+ _ASM_EXTABLE_UA(31b, 8b)
+ _ASM_EXTABLE_UA(4b, 8b)
+ _ASM_EXTABLE_UA(41b, 8b)
+ _ASM_EXTABLE_UA(10b, 8b)
+ _ASM_EXTABLE_UA(51b, 8b)
+ _ASM_EXTABLE_UA(11b, 8b)
+ _ASM_EXTABLE_UA(61b, 8b)
+ _ASM_EXTABLE_UA(12b, 8b)
+ _ASM_EXTABLE_UA(71b, 8b)
+ _ASM_EXTABLE_UA(13b, 8b)
+ _ASM_EXTABLE_UA(81b, 8b)
+ _ASM_EXTABLE_UA(14b, 8b)
+ _ASM_EXTABLE_UA(91b, 8b)
+ _ASM_EXTABLE_TYPE_REG(6b, 8b, EX_TYPE_UCOPY_LEN4, %%eax)
+ _ASM_EXTABLE_UA(7b, 8b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -315,14 +302,8 @@ do { \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "5: addl %3,%0\n" \
- " jmp 2b\n" \
- "3: lea 0(%3,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(4b, 5b) \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 508c81e97ab1..0ae6cf804197 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -35,12 +35,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
" incq %[dst]\n"
" decl %%ecx ; jnz 1b\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3: lea 0(%[size1],%[size8],8),%[size8]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 3b)
+
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
_ASM_EXTABLE_UA(1b, 2b)
+
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
clac();
@@ -121,7 +119,7 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
/* cache copy and flush to align dest */
if (!IS_ALIGNED(dest, 8)) {
- unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+ size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
memcpy((void *) dest, (void *) source, len);
clean_cache_range((void *) dest, len);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index ec31f5b60323..d12d1358f96d 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -690,7 +690,10 @@ AVXcode: 2
45: vpsrlvd/q Vx,Hx,Wx (66),(v)
46: vpsravd Vx,Hx,Wx (66),(v) | vpsravd/q Vx,Hx,Wx (66),(evo)
47: vpsllvd/q Vx,Hx,Wx (66),(v)
-# Skip 0x48-0x4b
+# Skip 0x48
+49: TILERELEASE (v1),(000),(11B) | LDTILECFG Mtc (v1)(000) | STTILECFG Mtc (66),(v1),(000) | TILEZERO Vt (F2),(v1),(11B)
+# Skip 0x4a
+4b: TILELOADD Vt,Wsm (F2),(v1) | TILELOADDT1 Vt,Wsm (66),(v1) | TILESTORED Wsm,Vt (F3),(v)
4c: vrcp14ps/d Vpd,Wpd (66),(ev)
4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev)
4e: vrsqrt14ps/d Vpd,Wpd (66),(ev)
@@ -705,7 +708,10 @@ AVXcode: 2
59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo)
5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo)
5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev)
-# Skip 0x5c-0x61
+5c: TDPBF16PS Vt,Wt,Ht (F3),(v1)
+# Skip 0x5d
+5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1)
+# Skip 0x5f-0x61
62: vpexpandb/w Vx,Wx (66),(ev)
63: vpcompressb/w Wx,Vx (66),(ev)
64: vpblendmd/q Vx,Hx,Wx (66),(ev)
@@ -822,9 +828,9 @@ AVXcode: 3
05: vpermilpd Vx,Wx,Ib (66),(v)
06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
07:
-08: vroundps Vx,Wx,Ib (66) | vrndscaleps Vx,Wx,Ib (66),(evo)
+08: vroundps Vx,Wx,Ib (66) | vrndscaleps Vx,Wx,Ib (66),(evo) | vrndscaleph Vx,Wx,Ib (evo)
09: vroundpd Vx,Wx,Ib (66) | vrndscalepd Vx,Wx,Ib (66),(evo)
-0a: vroundss Vss,Wss,Ib (66),(v1) | vrndscaless Vx,Hx,Wx,Ib (66),(evo)
+0a: vroundss Vss,Wss,Ib (66),(v1) | vrndscaless Vx,Hx,Wx,Ib (66),(evo) | vrndscalesh Vx,Hx,Wx,Ib (evo)
0b: vroundsd Vsd,Wsd,Ib (66),(v1) | vrndscalesd Vx,Hx,Wx,Ib (66),(evo)
0c: vblendps Vx,Hx,Wx,Ib (66)
0d: vblendpd Vx,Hx,Wx,Ib (66)
@@ -846,8 +852,8 @@ AVXcode: 3
22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
23: vshuff32x4/64x2 Vx,Hx,Wx,Ib (66),(ev)
25: vpternlogd/q Vx,Hx,Wx,Ib (66),(ev)
-26: vgetmantps/d Vx,Wx,Ib (66),(ev)
-27: vgetmantss/d Vx,Hx,Wx,Ib (66),(ev)
+26: vgetmantps/d Vx,Wx,Ib (66),(ev) | vgetmantph Vx,Wx,Ib (ev)
+27: vgetmantss/d Vx,Hx,Wx,Ib (66),(ev) | vgetmantsh Vx,Hx,Wx,Ib (ev)
30: kshiftrb/w Vk,Uk,Ib (66),(v)
31: kshiftrd/q Vk,Uk,Ib (66),(v)
32: kshiftlb/w Vk,Uk,Ib (66),(v)
@@ -871,23 +877,102 @@ AVXcode: 3
51: vrangess/d Vx,Hx,Wx,Ib (66),(ev)
54: vfixupimmps/d Vx,Hx,Wx,Ib (66),(ev)
55: vfixupimmss/d Vx,Hx,Wx,Ib (66),(ev)
-56: vreduceps/d Vx,Wx,Ib (66),(ev)
-57: vreducess/d Vx,Hx,Wx,Ib (66),(ev)
+56: vreduceps/d Vx,Wx,Ib (66),(ev) | vreduceph Vx,Wx,Ib (ev)
+57: vreducess/d Vx,Hx,Wx,Ib (66),(ev) | vreducesh Vx,Hx,Wx,Ib (ev)
60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
-66: vfpclassps/d Vk,Wx,Ib (66),(ev)
-67: vfpclassss/d Vk,Wx,Ib (66),(ev)
+66: vfpclassps/d Vk,Wx,Ib (66),(ev) | vfpclassph Vx,Wx,Ib (ev)
+67: vfpclassss/d Vk,Wx,Ib (66),(ev) | vfpclasssh Vx,Wx,Ib (ev)
70: vpshldw Vx,Hx,Wx,Ib (66),(ev)
71: vpshldd/q Vx,Hx,Wx,Ib (66),(ev)
72: vpshrdw Vx,Hx,Wx,Ib (66),(ev)
73: vpshrdd/q Vx,Hx,Wx,Ib (66),(ev)
+c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev)
cc: sha1rnds4 Vdq,Wdq,Ib
ce: vgf2p8affineqb Vx,Wx,Ib (66)
cf: vgf2p8affineinvqb Vx,Wx,Ib (66)
df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
-f0: RORX Gy,Ey,Ib (F2),(v)
+f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B)
+EndTable
+
+Table: EVEX map 5
+Referrer:
+AVXcode: 5
+10: vmovsh Vx,Hx,Wx (F3),(ev) | vmovsh Vx,Wx (F3),(ev)
+11: vmovsh Wx,Hx,Vx (F3),(ev) | vmovsh Wx,Vx (F3),(ev)
+1d: vcvtps2phx Vx,Wx (66),(ev) | vcvtss2sh Vx,Hx,Wx (ev)
+2a: vcvtsi2sh Vx,Hx,Wx (F3),(ev)
+2c: vcvttsh2si Vx,Wx (F3),(ev)
+2d: vcvtsh2si Vx,Wx (F3),(ev)
+2e: vucomish Vx,Wx (ev)
+2f: vcomish Vx,Wx (ev)
+51: vsqrtph Vx,Wx (ev) | vsqrtsh Vx,Hx,Wx (F3),(ev)
+58: vaddph Vx,Hx,Wx (ev) | vaddsh Vx,Hx,Wx (F3),(ev)
+59: vmulph Vx,Hx,Wx (ev) | vmulsh Vx,Hx,Wx (F3),(ev)
+5a: vcvtpd2ph Vx,Wx (66),(ev) | vcvtph2pd Vx,Wx (ev) | vcvtsd2sh Vx,Hx,Wx (F2),(ev) | vcvtsh2sd Vx,Hx,Wx (F3),(ev)
+5b: vcvtdq2ph Vx,Wx (ev) | vcvtph2dq Vx,Wx (66),(ev) | vcvtqq2ph Vx,Wx (ev) | vcvttph2dq Vx,Wx (F3),(ev)
+5c: vsubph Vx,Hx,Wx (ev) | vsubsh Vx,Hx,Wx (F3),(ev)
+5d: vminph Vx,Hx,Wx (ev) | vminsh Vx,Hx,Wx (F3),(ev)
+5e: vdivph Vx,Hx,Wx (ev) | vdivsh Vx,Hx,Wx (F3),(ev)
+5f: vmaxph Vx,Hx,Wx (ev) | vmaxsh Vx,Hx,Wx (F3),(ev)
+6e: vmovw Vx,Wx (66),(ev)
+78: vcvttph2udq Vx,Wx (ev) | vcvttph2uqq Vx,Wx (66),(ev) | vcvttsh2usi Vx,Wx (F3),(ev)
+79: vcvtph2udq Vx,Wx (ev) | vcvtph2uqq Vx,Wx (66),(ev) | vcvtsh2usi Vx,Wx (F3),(ev)
+7a: vcvttph2qq Vx,Wx (66),(ev) | vcvtudq2ph Vx,Wx (F2),(ev) | vcvtuqq2ph Vx,Wx (F2),(ev)
+7b: vcvtph2qq Vx,Wx (66),(ev) | vcvtusi2sh Vx,Hx,Wx (F3),(ev)
+7c: vcvttph2uw Vx,Wx (ev) | vcvttph2w Vx,Wx (66),(ev)
+7d: vcvtph2uw Vx,Wx (ev) | vcvtph2w Vx,Wx (66),(ev) | vcvtuw2ph Vx,Wx (F2),(ev) | vcvtw2ph Vx,Wx (F3),(ev)
+7e: vmovw Wx,Vx (66),(ev)
+EndTable
+
+Table: EVEX map 6
+Referrer:
+AVXcode: 6
+13: vcvtph2psx Vx,Wx (66),(ev) | vcvtsh2ss Vx,Hx,Wx (ev)
+2c: vscalefph Vx,Hx,Wx (66),(ev)
+2d: vscalefsh Vx,Hx,Wx (66),(ev)
+42: vgetexpph Vx,Wx (66),(ev)
+43: vgetexpsh Vx,Hx,Wx (66),(ev)
+4c: vrcpph Vx,Wx (66),(ev)
+4d: vrcpsh Vx,Hx,Wx (66),(ev)
+4e: vrsqrtph Vx,Wx (66),(ev)
+4f: vrsqrtsh Vx,Hx,Wx (66),(ev)
+56: vfcmaddcph Vx,Hx,Wx (F2),(ev) | vfmaddcph Vx,Hx,Wx (F3),(ev)
+57: vfcmaddcsh Vx,Hx,Wx (F2),(ev) | vfmaddcsh Vx,Hx,Wx (F3),(ev)
+96: vfmaddsub132ph Vx,Hx,Wx (66),(ev)
+97: vfmsubadd132ph Vx,Hx,Wx (66),(ev)
+98: vfmadd132ph Vx,Hx,Wx (66),(ev)
+99: vfmadd132sh Vx,Hx,Wx (66),(ev)
+9a: vfmsub132ph Vx,Hx,Wx (66),(ev)
+9b: vfmsub132sh Vx,Hx,Wx (66),(ev)
+9c: vfnmadd132ph Vx,Hx,Wx (66),(ev)
+9d: vfnmadd132sh Vx,Hx,Wx (66),(ev)
+9e: vfnmsub132ph Vx,Hx,Wx (66),(ev)
+9f: vfnmsub132sh Vx,Hx,Wx (66),(ev)
+a6: vfmaddsub213ph Vx,Hx,Wx (66),(ev)
+a7: vfmsubadd213ph Vx,Hx,Wx (66),(ev)
+a8: vfmadd213ph Vx,Hx,Wx (66),(ev)
+a9: vfmadd213sh Vx,Hx,Wx (66),(ev)
+aa: vfmsub213ph Vx,Hx,Wx (66),(ev)
+ab: vfmsub213sh Vx,Hx,Wx (66),(ev)
+ac: vfnmadd213ph Vx,Hx,Wx (66),(ev)
+ad: vfnmadd213sh Vx,Hx,Wx (66),(ev)
+ae: vfnmsub213ph Vx,Hx,Wx (66),(ev)
+af: vfnmsub213sh Vx,Hx,Wx (66),(ev)
+b6: vfmaddsub231ph Vx,Hx,Wx (66),(ev)
+b7: vfmsubadd231ph Vx,Hx,Wx (66),(ev)
+b8: vfmadd231ph Vx,Hx,Wx (66),(ev)
+b9: vfmadd231sh Vx,Hx,Wx (66),(ev)
+ba: vfmsub231ph Vx,Hx,Wx (66),(ev)
+bb: vfmsub231sh Vx,Hx,Wx (66),(ev)
+bc: vfnmadd231ph Vx,Hx,Wx (66),(ev)
+bd: vfnmadd231sh Vx,Hx,Wx (66),(ev)
+be: vfnmsub231ph Vx,Hx,Wx (66),(ev)
+bf: vfnmsub231sh Vx,Hx,Wx (66),(ev)
+d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev)
+d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev)
EndTable
GrpTable: Grp1
@@ -970,7 +1055,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5: rdpkru (110),(11B) | wrpkru (111),(11B) | SAVEPREVSSP (F3),(010),(11B) | RSTORSSP Mq (F3) | SETSSBSY (F3),(000),(11B)
+5: rdpkru (110),(11B) | wrpkru (111),(11B) | SAVEPREVSSP (F3),(010),(11B) | RSTORSSP Mq (F3) | SETSSBSY (F3),(000),(11B) | CLUI (F3),(110),(11B) | SERIALIZE (000),(11B) | STUI (F3),(111),(11B) | TESTUI (F3)(101)(11B) | UIRET (F3),(100),(11B) | XRESLDTRK (F2),(000),(11B) | XSUSLDTRK (F2),(001),(11B)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -987,7 +1072,7 @@ GrpTable: Grp9
3: xrstors
4: xsavec
5: xsaves
-6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) | SENDUIPI Gq (F3)
7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
EndTable
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index 951da2ad54bb..8c270ab415be 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -341,7 +341,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#ifdef PARANOID
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index d047d1816abe..637439bfefa4 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,5 +44,5 @@ SYM_FUNC_START(FPU_div_small)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_div_small)
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 034748459482..d62662bdd460 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft)
void finit(void)
{
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
}
/*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 8679a9d6c47f..7fe56c594aa6 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -31,7 +31,7 @@
#include <linux/uaccess.h>
#include <asm/traps.h>
#include <asm/user.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include "fpu_system.h"
#include "fpu_emu.h"
@@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft;
void *space = s387->st_space;
int ret;
int offset, other, i, tags, regnr, tag, newtop;
@@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target,
const struct user_regset *regset,
struct membuf to)
{
- struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft;
const void *space = s387->st_space;
int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 70d35c200945..94c4023092f3 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -144,7 +144,7 @@ extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
extern int FPU_round_to_int(FPU_REG *r, u_char tag);
extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s);
-extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
+extern void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d);
extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address);
extern int FPU_tagof(FPU_REG *ptr);
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 9b41391867dc..eec3e4805c75 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d)
return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
}
-#define I387 (&current->thread.fpu.state)
+#define I387 (&current->thread.fpu.fpstate->regs)
#define FPU_info (I387->soft.info)
#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 4a9887851ad8..990d847ae902 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -547,7 +547,7 @@ static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
single_arg_error(st0_ptr, st0_tag);
}
-static int fsin(FPU_REG *st0_ptr, u_char tag)
+static int f_sin(FPU_REG *st0_ptr, u_char tag)
{
u_char arg_sign = getsign(st0_ptr);
@@ -608,6 +608,11 @@ static int fsin(FPU_REG *st0_ptr, u_char tag)
}
}
+static void fsin(FPU_REG *st0_ptr, u_char tag)
+{
+ f_sin(st0_ptr, tag);
+}
+
static int f_cos(FPU_REG *st0_ptr, u_char tag)
{
u_char st0_sign;
@@ -724,7 +729,7 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
}
reg_copy(st0_ptr, &arg);
- if (!fsin(st0_ptr, st0_tag)) {
+ if (!f_sin(st0_ptr, st0_tag)) {
push();
FPU_copy_to_reg0(&arg, st0_tag);
f_cos(&st(0), st0_tag);
@@ -1635,7 +1640,7 @@ void FPU_triga(void)
}
static FUNC_ST0 const trig_table_b[] = {
- fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
+ fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, fsin, fcos
};
void FPU_trigb(void)
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index b82ca14ba718..4a9fd9029a53 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -153,7 +153,7 @@ static long pm_address(u_char FPU_modrm, u_char segment,
switch (segment) {
case PREFIX_GS_ - 1:
/* user gs handling can be lazy, use special accessors */
- addr->selector = get_user_gs(FPU_info->regs);
+ savesegment(gs, addr->selector);
break;
default:
addr->selector = PM_REG_(segment);
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index f15263e158e8..4092df79de4f 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -240,7 +240,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
fix-up operations. */
return 1;
case 022: /* frstor m94/108byte */
- frstor(addr_modes, (u_char __user *) data_address);
+ FPU_frstor(addr_modes, (u_char __user *) data_address);
/* Ensure that the values just loaded are not changed by
fix-up operations. */
return 1;
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 4afc7b1fa6e9..54a031b66142 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,7 +62,7 @@ SYM_FUNC_START(mul32_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul32_Xsig)
@@ -115,7 +115,7 @@ SYM_FUNC_START(mul64_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul64_Xsig)
@@ -175,5 +175,5 @@ SYM_FUNC_START(mul_Xsig_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 702315eecb86..35fd723fc0df 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,5 +133,5 @@ L_accum_done:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index fe6246ff9887..7e4521fbe7da 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -964,7 +964,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
/* The return value (in eax) is zero if the result is exact,
if bits are changed due to rounding, truncation, etc, then
a non-zero value is returned */
-/* Overflow is signalled by a non-zero return value (in eax).
+/* Overflow is signaled by a non-zero return value (in eax).
In the case of overflow, the returned significand always has the
largest possible value */
int FPU_round_to_int(FPU_REG *r, u_char tag)
@@ -1117,7 +1117,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
return s;
}
-void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
+void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
{
int i, regnr;
u_char __user *s = fldenv(addr_modes, data_address);
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index cad1d60b1e84..594936eeed67 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -72,7 +72,7 @@ L_exit_valid:
L_exit:
popl %ebx
leave
- ret
+ RET
L_zero:
@@ -138,7 +138,7 @@ L_exit_nuo_valid:
popl %ebx
leave
- ret
+ RET
L_exit_nuo_zero:
movl TAG_Zero,%eax
@@ -146,5 +146,5 @@ L_exit_nuo_zero:
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index 11a1f798451b..0bb2a092161a 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -437,7 +437,7 @@ fpu_Arith_exit:
popl %edi
popl %esi
leave
- ret
+ RET
/*
@@ -575,7 +575,7 @@ Normalise_result:
#ifdef PECULIAR_486
/*
* This implements a special feature of 80486 behaviour.
- * Underflow will be signalled even if the number is
+ * Underflow will be signaled even if the number is
* not a denormal after rounding.
* This difference occurs only for masked underflow, and not
* in the unmasked case.
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 9c9e2c810afe..07247287a3af 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -164,6 +164,6 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index e2fb5c2644c5..b5a41e2fc484 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -468,7 +468,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 0c779c87ac5b..e2588b24b8c2 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -144,7 +144,7 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index e9bb7c248649..4c900c29e4ff 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,5 +270,5 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index d9d7de8dbd7b..126c40473bad 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(round_Xsig)
@@ -138,5 +138,5 @@ L_n_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 726af985f758..f726bf6f6396 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -45,7 +45,7 @@ SYM_FUNC_START(shr_Xsig)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -61,7 +61,7 @@ L_more_than_31:
movl $0,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -76,7 +76,7 @@ L_more_than_63:
movl %edx,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -85,5 +85,5 @@ L_more_than_95:
movl %eax,8(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 4fc89174caf0..f608a28a4c43 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -55,7 +55,7 @@ SYM_FUNC_START(FPU_shrx)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -70,7 +70,7 @@ L_more_than_31:
movl $0,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -84,7 +84,7 @@ L_more_than_63:
movl %edx,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -92,7 +92,7 @@ L_more_than_95:
movl %eax,4(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrx)
@@ -146,7 +146,7 @@ SYM_FUNC_START(FPU_shrxs)
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [0..31] bits */
Ls_less_than_32:
@@ -163,7 +163,7 @@ Ls_less_than_32:
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [64..95] bits */
Ls_more_than_63:
@@ -189,7 +189,7 @@ Ls_more_than_63:
popl %ebx
popl %esi
leave
- ret
+ RET
Ls_more_than_95:
/* Shift by [96..inf) bits */
@@ -203,5 +203,5 @@ Ls_more_than_95:
popl %ebx
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrxs)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5864219221ca..f8220fd2c169 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,9 +2,11 @@
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -13,17 +15,17 @@ KCSAN_SANITIZE := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
- pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
+ pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o
obj-y += pat/
# Make sure __phys_addr has no stackprotector
CFLAGS_physaddr.o := -fno-stack-protector
-CFLAGS_setup_nx.o := -fno-stack-protector
CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
@@ -52,6 +54,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
+
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 058b2f36b3a6..b3ca7d23e4b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -154,7 +154,7 @@ int __init amd_numa_init(void)
node_set(nodeid, numa_nodes_parsed);
}
- if (!nodes_weight(numa_nodes_parsed))
+ if (nodes_empty(numa_nodes_parsed))
return -ENOENT;
/*
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index f5e1e60c9095..6c2f1b76a0b6 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -110,6 +110,13 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
cea_map_stack(NMI);
cea_map_stack(DB);
cea_map_stack(MCE);
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
+ cea_map_stack(VC);
+ cea_map_stack(VC2);
+ }
+ }
}
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b93d6cd08a7f..dba2197c05c3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,48 +2,58 @@
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
+#include <linux/bitfield.h>
#include <xen/xen.h>
-#include <asm/fpu/internal.h>
-#include <asm/sev-es.h>
+#include <asm/fpu/api.h>
+#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
+#include <asm/insn-eval.h>
+#include <asm/sgx.h>
-typedef bool (*ex_handler_t)(const struct exception_table_entry *,
- struct pt_regs *, int, unsigned long,
- unsigned long);
+static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
+{
+ int reg_offset = pt_regs_offset(regs, nr);
+ static unsigned long __dummy;
+
+ if (WARN_ON_ONCE(reg_offset < 0))
+ return &__dummy;
+
+ return (unsigned long *)((unsigned long)regs + reg_offset);
+}
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
}
-static inline ex_handler_t
-ex_fixup_handler(const struct exception_table_entry *x)
-{
- return (ex_handler_t)((unsigned long)&x->handler + x->handler);
-}
-__visible bool ex_handler_default(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_default(const struct exception_table_entry *e,
+ struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
+ if (e->data & EX_FLAG_CLEAR_AX)
+ regs->ax = 0;
+ if (e->data & EX_FLAG_CLEAR_DX)
+ regs->dx = 0;
+
+ regs->ip = ex_fixup_addr(e);
return true;
}
-EXPORT_SYMBOL(ex_handler_default);
-__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
- regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
- return true;
+ return ex_handler_default(fixup, regs);
+}
+
+static bool ex_handler_sgx(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL_GPL(ex_handler_fault);
/*
* Handler for when we fail to restore a task's FPU state. We should never get
@@ -55,111 +65,93 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* of vulnerability by restoring from the initial state (essentially, zeroing
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
-__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+ struct pt_regs *regs)
{
regs->ip = ex_fixup_addr(fixup);
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
- __copy_kernel_to_fpregs(&init_fpstate, -1);
+ fpu_reset_from_exception_fixup();
return true;
}
-EXPORT_SYMBOL_GPL(ex_handler_fprestore);
-__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- regs->ip = ex_fixup_addr(fixup);
- return true;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_uaccess);
-__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_copy(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- regs->ip = ex_fixup_addr(fixup);
- regs->ax = trapnr;
- return true;
+ return ex_handler_fault(fixup, regs, trapnr);
}
-EXPORT_SYMBOL(ex_handler_copy);
-__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_msr(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
- if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ if (!safe && wrmsr &&
+ pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
+
+ if (!safe && !wrmsr &&
+ pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
show_stack_regs(regs);
- /* Pretend that the read succeeded and returned 0. */
- regs->ip = ex_fixup_addr(fixup);
- regs->ax = 0;
- regs->dx = 0;
- return true;
-}
-EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
+ if (!wrmsr) {
+ /* Pretend that the read succeeded and returned 0. */
+ regs->ax = 0;
+ regs->dx = 0;
+ }
-__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx,
- (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
- show_stack_regs(regs);
+ if (safe)
+ *pt_regs_nr(regs, reg) = -EIO;
- /* Pretend that the write succeeded. */
- regs->ip = ex_fixup_addr(fixup);
- return true;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
-__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
+ struct pt_regs *regs)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
asm volatile ("mov %0, %%fs" : : "rm" (0));
- return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_clear_fs);
-enum handler_type ex_get_fault_handler_type(unsigned long ip)
+static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int reg, int imm)
{
- const struct exception_table_entry *e;
- ex_handler_t handler;
+ *pt_regs_nr(regs, reg) = (long)imm;
+ return ex_handler_default(fixup, regs);
+}
- e = search_exception_tables(ip);
- if (!e)
- return EX_HANDLER_NONE;
- handler = ex_fixup_handler(e);
- if (handler == ex_handler_fault)
- return EX_HANDLER_FAULT;
- else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
- return EX_HANDLER_UACCESS;
- else
- return EX_HANDLER_OTHER;
+static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr, int reg, int imm)
+{
+ regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
+ return ex_handler_uaccess(fixup, regs, trapnr);
+}
+
+int ex_get_fixup_type(unsigned long ip)
+{
+ const struct exception_table_entry *e = search_exception_tables(ip);
+
+ return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
unsigned long fault_addr)
{
const struct exception_table_entry *e;
- ex_handler_t handler;
+ int type, reg, imm;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -179,8 +171,52 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
if (!e)
return 0;
- handler = ex_fixup_handler(e);
- return handler(e, regs, trapnr, error_code, fault_addr);
+ type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
+ reg = FIELD_GET(EX_DATA_REG_MASK, e->data);
+ imm = FIELD_GET(EX_DATA_IMM_MASK, e->data);
+
+ switch (type) {
+ case EX_TYPE_DEFAULT:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ return ex_handler_default(e, regs);
+ case EX_TYPE_FAULT:
+ case EX_TYPE_FAULT_MCE_SAFE:
+ return ex_handler_fault(e, regs, trapnr);
+ case EX_TYPE_UACCESS:
+ return ex_handler_uaccess(e, regs, trapnr);
+ case EX_TYPE_COPY:
+ return ex_handler_copy(e, regs, trapnr);
+ case EX_TYPE_CLEAR_FS:
+ return ex_handler_clear_fs(e, regs);
+ case EX_TYPE_FPU_RESTORE:
+ return ex_handler_fprestore(e, regs);
+ case EX_TYPE_BPF:
+ return ex_handler_bpf(e, regs);
+ case EX_TYPE_WRMSR:
+ return ex_handler_msr(e, regs, true, false, reg);
+ case EX_TYPE_RDMSR:
+ return ex_handler_msr(e, regs, false, false, reg);
+ case EX_TYPE_WRMSR_SAFE:
+ return ex_handler_msr(e, regs, true, true, reg);
+ case EX_TYPE_RDMSR_SAFE:
+ return ex_handler_msr(e, regs, false, true, reg);
+ case EX_TYPE_WRMSR_IN_MCE:
+ ex_handler_msr_mce(regs, true);
+ break;
+ case EX_TYPE_RDMSR_IN_MCE:
+ ex_handler_msr_mce(regs, false);
+ break;
+ case EX_TYPE_POP_REG:
+ regs->sp += sizeof(long);
+ fallthrough;
+ case EX_TYPE_IMM_REG:
+ return ex_handler_imm_reg(e, regs, reg, imm);
+ case EX_TYPE_FAULT_SGX:
+ return ex_handler_sgx(e, regs, trapnr);
+ case EX_TYPE_UCOPY_LEN:
+ return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+ }
+ BUG();
}
extern unsigned int early_recursion_flag;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a73347e2cdfc..fad8faa29d04 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -32,6 +32,7 @@
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
+#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -148,7 +149,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char opcode;
if (user_mode(regs)) {
- if (get_user(opcode, instr))
+ if (get_user(opcode, (unsigned char __user *) instr))
break;
} else {
if (get_kernel_nofault(opcode, instr))
@@ -631,6 +632,9 @@ static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
+#ifdef CONFIG_VMAP_STACK
+ struct stack_info info;
+#endif
unsigned long flags;
int sig;
@@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
- unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
+ get_stack_guard_info((void *)address, &info)) {
/*
* We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but
@@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* and then double-fault, though, because we're likely to
* break the console driver and lose most of the stack dump.
*/
- asm volatile ("movq %[stack], %%rsp\n\t"
- "call handle_stack_overflow\n\t"
- "1: jmp 1b"
- : ASM_CALL_CONSTRAINT
- : "D" ("kernel stack overflow (page fault)"),
- "S" (regs), "d" (address),
- [stack] "rm" (stack));
+ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
+ handle_stack_overflow,
+ ASM_CALL_ARG3,
+ , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
+
unreachable();
}
#endif
@@ -710,7 +710,8 @@ oops:
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+ unsigned long address, int signal, int si_code,
+ u32 pkey)
{
WARN_ON_ONCE(user_mode(regs));
@@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
set_signal_archinfo(address, error_code);
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
+ if (si_code == SEGV_PKUERR) {
+ force_sig_pkuerr((void __user *)address, pkey);
+ } else {
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
}
/*
@@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
if (!user_mode(regs)) {
- kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGSEGV, si_code, pkey);
return;
}
@@ -836,8 +842,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (si_code == SEGV_PKUERR)
force_sig_pkuerr((void __user *)address, pkey);
-
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ else
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
local_irq_disable();
}
@@ -875,7 +881,7 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
/* This code is always called on the current mm */
bool foreign = false;
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return false;
if (error_code & X86_PF_PK)
return true;
@@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
{
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
- kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
return;
}
@@ -1186,7 +1193,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
return;
/* kprobes don't want to hook the spurious faults: */
- if (kprobe_page_fault(regs, X86_TRAP_PF))
+ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
@@ -1239,7 +1246,7 @@ void do_user_addr_fault(struct pt_regs *regs,
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
+ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
@@ -1396,7 +1403,8 @@ good_area:
*/
if (!user_mode(regs))
kernelmode_fixup_or_oops(regs, error_code, address,
- SIGBUS, BUS_ADRERR);
+ SIGBUS, BUS_ADRERR,
+ ARCH_DEFAULT_PKEY);
return;
}
@@ -1405,8 +1413,7 @@ good_area:
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) &&
- (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
@@ -1416,7 +1423,8 @@ good_area:
return;
if (fatal_signal_pending(current) && !user_mode(regs)) {
- kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ 0, 0, ARCH_DEFAULT_PKEY);
return;
}
@@ -1424,7 +1432,8 @@ good_area:
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
+ SIGSEGV, SEGV_MAPERR,
+ ARCH_DEFAULT_PKEY);
return;
}
@@ -1497,7 +1506,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* userspace task is trying to access some valid (from guest's point of
* view) memory which is not currently mapped by the host (e.g. the
* memory is swapped out). Note, the corresponding "page ready" event
- * which is injected when the memory becomes available, is delived via
+ * which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
*
@@ -1523,7 +1532,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
*
* In case the fault hit a RCU idle region the conditional entry
* code reenabled RCU to avoid subsequent wreckage which helps
- * debugability.
+ * debuggability.
*/
state = irqentry_enter(regs);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index dd694fb93916..d8cfce221275 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,9 +29,8 @@
/*
* We need to define the tracepoints somewhere, and tlb.c
- * is only compied when SMP=y.
+ * is only compiled when SMP=y.
*/
-#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include "mm_internal.h"
@@ -127,14 +126,12 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long ret = 0;
if (min_pfn_mapped < max_pfn_mapped) {
- ret = memblock_find_in_range(
+ ret = memblock_phys_alloc_range(
+ PAGE_SIZE * num, PAGE_SIZE,
min_pfn_mapped << PAGE_SHIFT,
- max_pfn_mapped << PAGE_SHIFT,
- PAGE_SIZE * num , PAGE_SIZE);
+ max_pfn_mapped << PAGE_SHIFT);
}
- if (ret)
- memblock_reserve(ret, PAGE_SIZE * num);
- else if (can_use_brk_pgt)
+ if (!ret && can_use_brk_pgt)
ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
if (!ret)
@@ -610,8 +607,17 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long addr;
unsigned long mapped_ram_size = 0;
- /* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ /*
+ * Systems that have many reserved areas near top of the memory,
+ * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+ * require lots of 4K mappings which may exhaust pgt_buf.
+ * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+ * there is enough mapped memory that can be allocated from
+ * memblock.
+ */
+ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+ map_end);
+ memblock_phys_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
@@ -707,6 +713,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
@@ -756,7 +767,7 @@ void __init init_mem_mapping(void)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- /* can we preseve max_low_pfn ?*/
+ /* can we preserve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
#else
@@ -939,7 +950,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
{
/*
* end could be not aligned, and We can not align that,
- * decompresser could be confused by aligned initrd_end
+ * decompressor could be confused by aligned initrd_end
* We already reserve the end partial page before in
* - i386_start_kernel()
* - x86_64_start_kernel()
@@ -1017,7 +1028,7 @@ void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index da31c2635ee4..d4e2648a1dfb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -238,11 +238,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
}
}
-/*
- * The <linux/kallsyms.h> already defines is_kernel_text,
- * using '__' prefix not to get in conflict.
- */
-static inline int __is_kernel_text(unsigned long addr)
+static inline int is_x86_32_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
@@ -333,8 +329,8 @@ repeat:
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
- if (__is_kernel_text(addr) ||
- __is_kernel_text(addr2))
+ if (is_x86_32_kernel_text(addr) ||
+ is_x86_32_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
@@ -359,7 +355,7 @@ repeat:
*/
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
- if (__is_kernel_text(addr))
+ if (is_x86_32_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
@@ -651,7 +647,7 @@ void __init find_low_pfn_range(void)
highmem_pfn_init();
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
#ifdef CONFIG_HIGHMEM
@@ -677,7 +673,7 @@ void __init initmem_init(void)
setup_bootmem_allocator();
}
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+#endif /* !CONFIG_NUMA */
void __init setup_bootmem_allocator(void)
{
@@ -755,8 +751,6 @@ void __init mem_init(void)
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
- mem_init_print_info(NULL);
-
/*
* Check boundaries twice: Some fundamental inconsistencies can
* be detected at build time already.
@@ -781,38 +775,6 @@ void __init mem_init(void)
test_wp_bit();
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_params *params)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
- int ret;
-
- /*
- * The page tables were already mapped at boot so if the caller
- * requests a different mapping type then we must change all the
- * pages with __set_memory_prot().
- */
- if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
- ret = __set_memory_prot(start, nr_pages, params->pgprot);
- if (ret)
- return ret;
- }
-
- return __add_pages(nid, start_pfn, nr_pages, params);
-}
-
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- __remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
int kernel_set_to_readonly __read_mostly;
static void mark_nxdata_nx(void)
@@ -823,7 +785,7 @@ static void mark_nxdata_nx(void)
*/
unsigned long start = PFN_ALIGN(_etext);
/*
- * This comes from __is_kernel_text upper limit. Also HPAGE where used:
+ * This comes from is_x86_32_kernel_text upper limit. Also HPAGE where used:
*/
unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b5a3fa4033d3..39c5246964a9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -33,6 +33,7 @@
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
+#include <linux/bootmem_info.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -109,7 +110,6 @@ int force_personality32;
/*
* noexec32=on|off
* Control non executable heap for 32bit processes.
- * To control the stack too use noexec=off
*
* on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
* off PROT_READ implies PROT_EXEC
@@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
/*
* With folded p4d, pgd_none() is always false, we need to
- * handle synchonization on p4d level.
+ * handle synchronization on p4d level.
*/
MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
@@ -193,8 +193,8 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
spin_lock(pgt_lock);
if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
- BUG_ON(p4d_page_vaddr(*p4d)
- != p4d_page_vaddr(*p4d_ref));
+ BUG_ON(p4d_pgtable(*p4d)
+ != p4d_pgtable(*p4d_ref));
if (p4d_none(*p4d))
set_p4d(p4d, *p4d_ref);
@@ -826,6 +826,107 @@ void __init paging_init(void)
zone_sizes_init();
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define PAGE_UNUSED 0xFD
+
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
+ * from unused_pmd_start to next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start __meminitdata;
+
+static void __meminit vmemmap_flush_unused_pmd(void)
+{
+ if (!unused_pmd_start)
+ return;
+ /*
+ * Clears (unused_pmd_start, PMD_END]
+ */
+ memset((void *)unused_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+ unused_pmd_start = 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* Returns true if the PMD is completely unused and thus it can be freed */
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ /*
+ * Flush the unused range cache to ensure that memchr_inv() will work
+ * for the whole range.
+ */
+ vmemmap_flush_unused_pmd();
+ memset((void *)addr, PAGE_UNUSED, end - addr);
+
+ return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
+}
+#endif
+
+static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed when removing some other adjacent memmap (just in
+ * case the first memmap never gets initialized e.g., because the memory
+ * block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_pmd_start == start) {
+ if (likely(IS_ALIGNED(end, PMD_SIZE)))
+ unused_pmd_start = 0;
+ else
+ unused_pmd_start = end;
+ return;
+ }
+
+ /*
+ * If the range does not contiguously follows previous one, make sure
+ * to mark the unused range of the previous one so it can be removed.
+ */
+ vmemmap_flush_unused_pmd();
+ __vmemmap_use_sub_pmd(start);
+}
+
+
+static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ const unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_pmd();
+
+ /*
+ * Could be our memmap page is filled with PAGE_UNUSED already from a
+ * previous remove. Make sure to reset it.
+ */
+ __vmemmap_use_sub_pmd(start);
+
+ /*
+ * Mark with PAGE_UNUSED the unused parts of the new memmap range
+ */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)page, PAGE_UNUSED, start - page);
+
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD where the
+ * unused range begins.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_pmd_start = end;
+}
+#endif
+
/*
* Memory hotplug specific functions
*/
@@ -871,8 +972,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
return add_pages(nid, start_pfn, nr_pages, params);
}
-#define PAGE_INUSE 0xFD
-
static void __meminit free_pagetable(struct page *page, int order)
{
unsigned long magic;
@@ -882,7 +981,7 @@ static void __meminit free_pagetable(struct page *page, int order)
if (PageReserved(page)) {
__ClearPageReserved(page);
- magic = (unsigned long)page->freelist;
+ magic = page->index;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
@@ -962,7 +1061,6 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
{
unsigned long next, pages = 0;
pte_t *pte;
- void *page_addr;
phys_addr_t phys_addr;
pte = pte_start + pte_index(addr);
@@ -983,42 +1081,15 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
- /*
- * Do not free direct mapping pages since they were
- * freed when offlining, or simplely not in use.
- */
- if (!direct)
- free_pagetable(pte_page(*pte), 0);
-
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
-
- /* For non-direct mapping, pages means nothing. */
- pages++;
- } else {
- /*
- * If we are here, we are freeing vmemmap pages since
- * direct mapped memory ranges to be freed are aligned.
- *
- * If we are not removing the whole page, it means
- * other page structs in this page are being used and
- * we canot remove them. So fill the unused page_structs
- * with 0xFD, and remove the page when it is wholly
- * filled with 0xFD.
- */
- memset((void *)addr, PAGE_INUSE, next - addr);
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
- page_addr = page_address(pte_page(*pte));
- if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
- free_pagetable(pte_page(*pte), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
}
/* Call free_pte_table() in remove_pmd_table(). */
@@ -1034,7 +1105,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
- void *page_addr;
pmd = pmd_start + pmd_index(addr);
for (; addr < end; addr = next, pmd++) {
@@ -1054,22 +1124,16 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pmd_page(*pmd));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PMD_SIZE)) {
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (vmemmap_pmd_is_unused(addr, next)) {
free_hugepage_table(pmd_page(*pmd),
altmap);
-
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
- }
}
-
+#endif
continue;
}
@@ -1090,7 +1154,6 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
- void *page_addr;
pud = pud_start + pud_index(addr);
for (; addr < end; addr = next, pud++) {
@@ -1099,33 +1162,13 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!pud_present(*pud))
continue;
- if (pud_large(*pud)) {
- if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE)) {
- if (!direct)
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pud_page(*pud));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PUD_SIZE)) {
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
-
+ if (pud_large(*pud) &&
+ IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
continue;
}
@@ -1197,6 +1240,9 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct,
void __ref vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+
remove_pagetable(start, end, false, altmap);
}
@@ -1209,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -1224,7 +1269,7 @@ static struct kcore_list kcore_vsyscall;
static void __init register_page_bootmem_info(void)
{
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
int i;
for_each_online_node(i)
@@ -1306,8 +1351,6 @@ void __init mem_init(void)
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
preallocate_vmalloc_pages();
-
- mem_init_print_info(NULL);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1389,18 +1432,18 @@ int kern_addr_valid(unsigned long addr)
return 0;
p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
+ if (!p4d_present(*p4d))
return 0;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+ if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return pfn_valid(pud_pfn(*pud));
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
@@ -1538,11 +1581,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+
continue;
} else if (altmap)
return -ENOMEM; /* no fallback */
} else if (pmd_large(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
continue;
}
if (vmemmap_populate_basepages(addr, next, node, NULL))
@@ -1556,6 +1605,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
{
int err;
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
@@ -1571,7 +1623,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return err;
}
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long nr_pages)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9e5ccc56f8e0..1ad0228f8ceb 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -14,7 +14,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/efi.h>
#include <linux/pgtable.h>
@@ -92,7 +92,7 @@ static unsigned int __ioremap_check_ram(struct resource *res)
*/
static unsigned int __ioremap_check_encrypted(struct resource *res)
{
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return 0;
switch (res->desc) {
@@ -112,13 +112,15 @@ static unsigned int __ioremap_check_encrypted(struct resource *res)
*/
static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc)
{
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
if (!IS_ENABLED(CONFIG_EFI))
return;
- if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA)
+ if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA ||
+ (efi_mem_type(addr) == EFI_BOOT_SERVICES_DATA &&
+ efi_mem_attributes(addr) & EFI_MEMORY_RUNTIME))
desc->flags |= IORES_MAP_ENCRYPTED;
}
@@ -240,10 +242,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
* If the page being mapped is in memory and SEV is active then
* make sure the memory encryption attribute is enabled in the
* resulting mapping.
+ * In TDX guests, memory is marked private by default. If encryption
+ * is not requested (using encrypted), explicitly set decrypt
+ * attribute in all IOREMAPPED memory.
*/
prot = PAGE_KERNEL_IO;
if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted)
prot = pgprot_encrypted(prot);
+ else
+ prot = pgprot_decrypted(prot);
switch (pcm) {
case _PAGE_CACHE_MODE_UC:
@@ -481,25 +488,6 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int __init arch_ioremap_p4d_supported(void)
-{
- return 0;
-}
-
-int __init arch_ioremap_pud_supported(void)
-{
-#ifdef CONFIG_X86_64
- return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
- return 0;
-#endif
-}
-
-int __init arch_ioremap_pmd_supported(void)
-{
- return boot_cpu_has(X86_FEATURE_PSE);
-}
-
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
@@ -525,6 +513,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
memunmap((void *)((unsigned long)addr & PAGE_MASK));
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Examine the physical address to determine if it is an area of memory
* that should be mapped decrypted. If the memory is not part of the
@@ -572,7 +561,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
case E820_TYPE_NVS:
case E820_TYPE_UNUSABLE:
/* For SEV, these areas are encrypted */
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
break;
fallthrough;
@@ -631,6 +620,7 @@ static bool memremap_is_efi_data(resource_size_t phys_addr,
static bool memremap_is_setup_data(resource_size_t phys_addr,
unsigned long size)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
u64 paddr, paddr_next;
@@ -643,6 +633,10 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
data = memremap(paddr, sizeof(*data),
MEMREMAP_WB | MEMREMAP_DEC);
+ if (!data) {
+ pr_warn("failed to memremap setup_data entry\n");
+ return false;
+ }
paddr_next = data->next;
len = data->len;
@@ -652,10 +646,21 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
return true;
}
- if (data->type == SETUP_INDIRECT &&
- ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
- paddr = ((struct setup_indirect *)data->data)->addr;
- len = ((struct setup_indirect *)data->data)->len;
+ if (data->type == SETUP_INDIRECT) {
+ memunmap(data);
+ data = memremap(paddr, sizeof(*data) + len,
+ MEMREMAP_WB | MEMREMAP_DEC);
+ if (!data) {
+ pr_warn("failed to memremap indirect setup_data\n");
+ return false;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ }
}
memunmap(data);
@@ -676,22 +681,51 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
unsigned long size)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
u64 paddr, paddr_next;
paddr = boot_params.hdr.setup_data;
while (paddr) {
- unsigned int len;
+ unsigned int len, size;
if (phys_addr == paddr)
return true;
data = early_memremap_decrypted(paddr, sizeof(*data));
+ if (!data) {
+ pr_warn("failed to early memremap setup_data entry\n");
+ return false;
+ }
+
+ size = sizeof(*data);
paddr_next = data->next;
len = data->len;
- early_memunmap(data, sizeof(*data));
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
+ early_memunmap(data, sizeof(*data));
+ return true;
+ }
+
+ if (data->type == SETUP_INDIRECT) {
+ size += len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap_decrypted(paddr, size);
+ if (!data) {
+ pr_warn("failed to early memremap indirect setup_data\n");
+ return false;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ }
+ }
+
+ early_memunmap(data, size);
if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
return true;
@@ -710,7 +744,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
unsigned long flags)
{
- if (!mem_encrypt_active())
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return true;
if (flags & MEMREMAP_ENC)
@@ -719,7 +753,7 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
if (flags & MEMREMAP_DEC)
return false;
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
if (memremap_is_setup_data(phys_addr, size) ||
memremap_is_efi_data(phys_addr, size))
return false;
@@ -740,12 +774,12 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
{
bool encrypted_prot;
- if (!mem_encrypt_active())
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return prot;
encrypted_prot = true;
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
if (early_memremap_is_setup_data(phys_addr, size) ||
memremap_is_efi_data(phys_addr, size))
encrypted_prot = false;
@@ -763,7 +797,6 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
return arch_memremap_can_ram_remap(phys_addr, size, 0);
}
-#ifdef CONFIG_AMD_MEM_ENCRYPT
/* Remap memory with encryption */
void __init *early_memremap_encrypted(resource_size_t phys_addr,
unsigned long size)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 1a50434c8a4d..e7b9b464a82f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -49,8 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
p = early_alloc(PMD_SIZE, nid, false);
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
return;
- else if (p)
- memblock_free(__pa(p), PMD_SIZE);
+ memblock_free(p, PMD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid, true);
@@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
p = early_alloc(PUD_SIZE, nid, false);
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
return;
- else if (p)
- memblock_free(__pa(p), PUD_SIZE);
+ memblock_free(p, PUD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid, true);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 6e6b39710e5f..557f0fe25dff 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt phyiscal memory region size based on available memory */
+ /* Adapt physical memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
kaslr_regions[0].size_tb = memory_tb;
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index be020a7bc414..d3efbc5b3449 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
- * Benfit many code from kprobes
+ * Benefit many code from kprobes
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
* 2007 Alexander Eichner
* 2008 Pekka Paalanen <pq@iki.fi>
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 92ec176a7293..5a53c2cc169c 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -4,11 +4,6 @@
#include <linux/kernel.h>
#ifdef CONFIG_X86_64
-static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits)
-{
- return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
-}
-
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
unsigned long vaddr = (unsigned long)unsafe_src;
@@ -19,7 +14,7 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
* we also need to include the userspace guard page.
*/
return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
- canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr;
+ __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ae78cef79980..9f27e14e185f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -1,394 +1,17 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * AMD Memory Encryption Support
+ * Memory Encryption Support Common Code
*
* Copyright (C) 2016 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/dma-direct.h>
+#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/tlbflush.h>
-#include <asm/fixmap.h>
-#include <asm/setup.h>
-#include <asm/bootparam.h>
-#include <asm/set_memory.h>
-#include <asm/cacheflush.h>
-#include <asm/processor-flags.h>
-#include <asm/msr.h>
-#include <asm/cmdline.h>
-
-#include "mm_internal.h"
-
-/*
- * Since SME related variables are set early in the boot process they must
- * reside in the .data section so as not to be zeroed out when the .bss
- * section is later cleared.
- */
-u64 sme_me_mask __section(".data") = 0;
-u64 sev_status __section(".data") = 0;
-u64 sev_check_data __section(".data") = 0;
-EXPORT_SYMBOL(sme_me_mask);
-DEFINE_STATIC_KEY_FALSE(sev_enable_key);
-EXPORT_SYMBOL_GPL(sev_enable_key);
-
-bool sev_enabled __section(".data");
-
-/* Buffer used for early in-place encryption by BSP, no locking needed */
-static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
-
-/*
- * This routine does not change the underlying encryption setting of the
- * page(s) that map this memory. It assumes that eventually the memory is
- * meant to be accessed as either encrypted or decrypted but the contents
- * are currently not in the desired state.
- *
- * This routine follows the steps outlined in the AMD64 Architecture
- * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
- */
-static void __init __sme_early_enc_dec(resource_size_t paddr,
- unsigned long size, bool enc)
-{
- void *src, *dst;
- size_t len;
-
- if (!sme_me_mask)
- return;
-
- wbinvd();
-
- /*
- * There are limited number of early mapping slots, so map (at most)
- * one page at time.
- */
- while (size) {
- len = min_t(size_t, sizeof(sme_early_buffer), size);
-
- /*
- * Create mappings for the current and desired format of
- * the memory. Use a write-protected mapping for the source.
- */
- src = enc ? early_memremap_decrypted_wp(paddr, len) :
- early_memremap_encrypted_wp(paddr, len);
-
- dst = enc ? early_memremap_encrypted(paddr, len) :
- early_memremap_decrypted(paddr, len);
-
- /*
- * If a mapping can't be obtained to perform the operation,
- * then eventual access of that area in the desired mode
- * will cause a crash.
- */
- BUG_ON(!src || !dst);
-
- /*
- * Use a temporary buffer, of cache-line multiple size, to
- * avoid data corruption as documented in the APM.
- */
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
-
- early_memunmap(dst, len);
- early_memunmap(src, len);
-
- paddr += len;
- size -= len;
- }
-}
-
-void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, true);
-}
-
-void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, false);
-}
-
-static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
- bool map)
-{
- unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
- pmdval_t pmd_flags, pmd;
-
- /* Use early_pmd_flags but remove the encryption mask */
- pmd_flags = __sme_clr(early_pmd_flags);
-
- do {
- pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
- __early_make_pgtable((unsigned long)vaddr, pmd);
-
- vaddr += PMD_SIZE;
- paddr += PMD_SIZE;
- size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
- } while (size);
-
- flush_tlb_local();
-}
-
-void __init sme_unmap_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!sme_active())
- return;
-
- /* Get the command line address before unmapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
-}
-
-void __init sme_map_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!sme_active())
- return;
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
-
- /* Get the command line address after mapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
-}
-
-void __init sme_early_init(void)
-{
- unsigned int i;
-
- if (!sme_me_mask)
- return;
-
- early_pmd_flags = __sme_set(early_pmd_flags);
-
- __supported_pte_mask = __sme_set(__supported_pte_mask);
-
- /* Update the protection map with memory encryption mask */
- for (i = 0; i < ARRAY_SIZE(protection_map); i++)
- protection_map[i] = pgprot_encrypted(protection_map[i]);
-
- if (sev_active())
- swiotlb_force = SWIOTLB_FORCE;
-}
-
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!sev_active())
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-}
-
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
-{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
-
- switch (level) {
- case PG_LEVEL_4K:
- pfn = pte_pfn(*kpte);
- old_prot = pte_pgprot(*kpte);
- break;
- case PG_LEVEL_2M:
- pfn = pmd_pfn(*(pmd_t *)kpte);
- old_prot = pmd_pgprot(*(pmd_t *)kpte);
- break;
- case PG_LEVEL_1G:
- pfn = pud_pfn(*(pud_t *)kpte);
- old_prot = pud_pgprot(*(pud_t *)kpte);
- break;
- default:
- return;
- }
-
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
- else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
-
- /* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
-
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
-
- /*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
- */
- clflush_cache_range(__va(pa), size);
-
- /* Encrypt/decrypt the contents in-place */
- if (enc)
- sme_early_encrypt(pa, size);
- else
- sme_early_decrypt(pa, size);
-
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
-}
-
-static int __init early_set_memory_enc_dec(unsigned long vaddr,
- unsigned long size, bool enc)
-{
- unsigned long vaddr_end, vaddr_next;
- unsigned long psize, pmask;
- int split_page_size_mask;
- int level, ret;
- pte_t *kpte;
-
- vaddr_next = vaddr;
- vaddr_end = vaddr + size;
-
- for (; vaddr < vaddr_end; vaddr = vaddr_next) {
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- ret = 1;
- goto out;
- }
-
- if (level == PG_LEVEL_4K) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
- continue;
- }
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- /*
- * Check whether we can change the large page in one go.
- * We request a split when the address is not aligned and
- * the number of pages to set/clear encryption bit is smaller
- * than the number of pages in the large page.
- */
- if (vaddr == (vaddr & pmask) &&
- ((vaddr_end - vaddr) >= psize)) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & pmask) + psize;
- continue;
- }
-
- /*
- * The virtual address is part of a larger page, create the next
- * level page table mapping (4K or 2M). If it is part of a 2M
- * page then we request a split of the large page into 4K
- * chunks. A 1GB large page is split into 2M pages, resp.
- */
- if (level == PG_LEVEL_2M)
- split_page_size_mask = 0;
- else
- split_page_size_mask = 1 << PG_LEVEL_2M;
-
- /*
- * kernel_physical_mapping_change() does not flush the TLBs, so
- * a TLB flush is required after we exit from the for loop.
- */
- kernel_physical_mapping_change(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
- }
-
- ret = 0;
-
-out:
- __flush_tlb_all();
- return ret;
-}
-
-int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, false);
-}
-
-int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, true);
-}
-
-/*
- * SME and SEV are very similar but they are not the same, so there are
- * times that the kernel will need to distinguish between SME and SEV. The
- * sme_active() and sev_active() functions are used for this. When a
- * distinction isn't needed, the mem_encrypt_active() function can be used.
- *
- * The trampoline code is a good example for this requirement. Before
- * paging is activated, SME will access all memory as decrypted, but SEV
- * will access all memory as encrypted. So, when APs are being brought
- * up under SME the trampoline area cannot be encrypted, whereas under SEV
- * the trampoline area must be encrypted.
- */
-bool sme_active(void)
-{
- return sme_me_mask && !sev_enabled;
-}
-
-bool sev_active(void)
-{
- return sev_status & MSR_AMD64_SEV_ENABLED;
-}
-EXPORT_SYMBOL_GPL(sev_active);
-
-/* Needs to be called from non-instrumentable code */
-bool noinstr sev_es_active(void)
-{
- return sev_status & MSR_AMD64_SEV_ES_ENABLED;
-}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -396,7 +19,7 @@ bool force_dma_unencrypted(struct device *dev)
/*
* For SEV, all DMA must be to unencrypted addresses.
*/
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return true;
/*
@@ -404,7 +27,7 @@ bool force_dma_unencrypted(struct device *dev)
* device does not support DMA to addresses that include the
* encryption mask.
*/
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
dev->bus_dma_limit);
@@ -416,36 +39,19 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-void __init mem_encrypt_free_decrypted_mem(void)
+static void print_mem_encrypt_feature_info(void)
{
- unsigned long vaddr, vaddr_end, npages;
- int r;
+ pr_info("Memory Encryption Features active:");
- vaddr = (unsigned long)__start_bss_decrypted_unused;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
-
- /*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
- */
- if (mem_encrypt_active()) {
- r = set_memory_encrypted(vaddr, npages);
- if (r) {
- pr_warn("failed to free unused decrypted pages\n");
- return;
- }
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_cont(" Intel TDX\n");
+ return;
}
- free_init_pages("unused decrypted", vaddr, vaddr_end);
-}
-
-static void print_mem_encrypt_feature_info(void)
-{
- pr_info("AMD Memory Encryption Features active:");
+ pr_cont(" AMD");
/* Secure Memory Encryption */
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
/*
* SME is mutually exclusive with any of the SEV
* features below.
@@ -455,32 +61,28 @@ static void print_mem_encrypt_feature_info(void)
}
/* Secure Encrypted Virtualization */
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pr_cont(" SEV");
/* Encrypted Register State */
- if (sev_es_active())
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
pr_cont(" SEV-ES");
+ /* Secure Nested Paging */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_cont(" SEV-SNP");
+
pr_cont("\n");
}
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
- if (!sme_me_mask)
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return;
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- /*
- * With SEV, we need to unroll the rep string I/O instructions,
- * but SEV-ES supports them through the #VC handler.
- */
- if (sev_active() && !sev_es_active())
- static_branch_enable(&sev_enable_key);
-
print_mem_encrypt_feature_info();
}
-
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
new file mode 100644
index 000000000000..f6d038e2cd8e
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
+#include <linux/platform-feature.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+#include <asm/sev.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
+EXPORT_SYMBOL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
+
+/*
+ * SNP-specific routine which needs to additionally change the page state from
+ * private to shared before copying the data from the source to destination and
+ * restore after the copy.
+ */
+static inline void __init snp_memcpy(void *dst, void *src, size_t sz,
+ unsigned long paddr, bool decrypt)
+{
+ unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+ if (decrypt) {
+ /*
+ * @paddr needs to be accessed decrypted, mark the page shared in
+ * the RMP table before copying it.
+ */
+ early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages);
+
+ memcpy(dst, src, sz);
+
+ /* Restore the page state after the memcpy. */
+ early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages);
+ } else {
+ /*
+ * @paddr need to be accessed encrypted, no need for the page state
+ * change.
+ */
+ memcpy(dst, src, sz);
+ }
+}
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ snp_memcpy(sme_early_buffer, src, len, paddr, enc);
+ snp_memcpy(dst, sme_early_buffer, len, paddr, !enc);
+ } else {
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+ }
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ flush_tlb_local();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+
+ /* Set restricted memory access for virtio. */
+ platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
+}
+
+static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
+{
+ unsigned long pfn = 0;
+ pgprot_t prot;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid level for kpte\n");
+ return 0;
+ }
+
+ if (ret_prot)
+ *ret_prot = prot;
+
+ return pfn;
+}
+
+static bool amd_enc_tlb_flush_required(bool enc)
+{
+ return true;
+}
+
+static bool amd_enc_cache_flush_required(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_SME_COHERENT);
+}
+
+static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+#ifdef CONFIG_PARAVIRT
+ unsigned long sz = npages << PAGE_SHIFT;
+ unsigned long vaddr_end = vaddr + sz;
+
+ while (vaddr < vaddr_end) {
+ int psize, pmask, level;
+ unsigned long pfn;
+ pte_t *kpte;
+
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ WARN_ONCE(1, "kpte lookup for vaddr\n");
+ return;
+ }
+
+ pfn = pg_level_to_pfn(level, kpte, NULL);
+ if (!pfn)
+ continue;
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
+
+ vaddr = (vaddr & pmask) + psize;
+ }
+#endif
+}
+
+static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
+{
+ /*
+ * To maintain the security guarantees of SEV-SNP guests, make sure
+ * to invalidate the memory before encryption attribute is cleared.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
+ snp_set_memory_shared(vaddr, npages);
+}
+
+/* Return true unconditionally: return value doesn't matter for the SEV side */
+static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
+{
+ /*
+ * After memory is mapped encrypted in the page table, validate it
+ * so that it is consistent with the page table updates.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc)
+ snp_set_memory_private(vaddr, npages);
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ enc_dec_hypercall(vaddr, npages, enc);
+
+ return true;
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ pfn = pg_level_to_pfn(level, kpte, &old_prot);
+ if (!pfn)
+ return;
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc) {
+ sme_early_encrypt(pa, size);
+ } else {
+ sme_early_decrypt(pa, size);
+
+ /*
+ * ON SNP, the page state in the RMP table must happen
+ * before the page table updates.
+ */
+ early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1);
+ }
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+
+ /*
+ * If page is set encrypted in the page table, then update the RMP table to
+ * add this page as private.
+ */
+ if (enc)
+ early_snp_set_memory_private((unsigned long)__va(pa), pa, 1);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next, start;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ start = vaddr;
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+ early_set_mem_enc_dec_hypercall(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+ enc_dec_hypercall(vaddr, npages, enc);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare;
+ x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish;
+ x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required;
+ x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required;
+}
+
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 7a84fc8bc5c3..3d1dba05fce4 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
* - intermediate copy buffer (PMD_PAGE_SIZE)
- * R8 - physcial address of the pagetables to use for encryption
+ * R8 - physical address of the pagetables to use for encryption
*/
push %rbp
@@ -65,7 +65,7 @@ SYM_FUNC_START(sme_encrypt_execute)
movq %rbp, %rsp /* Restore original stack pointer */
pop %rbp
- ret
+ RET
SYM_FUNC_END(sme_encrypt_execute)
SYM_FUNC_START(__enc_copy)
@@ -151,6 +151,6 @@ SYM_FUNC_START(__enc_copy)
pop %r12
pop %r15
- ret
+ RET
.L__enc_copy_end:
SYM_FUNC_END(__enc_copy)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..f415498d3175 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -27,13 +27,25 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cmdline.h>
+#include <asm/coco.h>
+#include <asm/sev.h>
#include "mm_internal.h"
@@ -287,7 +299,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
unsigned long pgtable_area_len;
unsigned long decrypted_base;
- if (!sme_active())
+ /*
+ * This is early code, use an open coded check for SME instead of
+ * using cc_platform_has(). This eliminates worries about removing
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
+ */
+ if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
return;
/*
@@ -492,8 +510,11 @@ void __init sme_enable(struct boot_params *bp)
bool active_by_default;
unsigned long me_mask;
char buffer[16];
+ bool snp;
u64 msr;
+ snp = snp_init(bp);
+
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
@@ -503,14 +524,6 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
@@ -523,31 +536,45 @@ void __init sme_enable(struct boot_params *bp)
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
return;
me_mask = 1UL << (ebx & 0x3f);
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ snp_abort();
+
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
/* For SME, check the SYSCFG MSR */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ msr = __rdmsr(MSR_AMD64_SYSCFG);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
} else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* Save SEV_STATUS to avoid reading MSR again */
- sev_status = msr;
-
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
- sev_enabled = true;
- physical_mask &= ~sme_me_mask;
- return;
+ goto out;
}
/*
@@ -581,6 +608,10 @@ void __init sme_enable(struct boot_params *bp)
sme_me_mask = 0;
else
sme_me_mask = active_by_default ? me_mask : 0;
-
- physical_mask &= ~sme_me_mask;
+out:
+ if (sme_me_mask) {
+ physical_mask &= ~sme_me_mask;
+ cc_set_vendor(CC_VENDOR_AMD);
+ cc_set_mask(sme_me_mask);
+ }
}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index cd768dafca9e..c3317f0650d8 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -376,12 +376,12 @@ static void enter_uniprocessor(void)
goto out;
}
- get_online_cpus();
+ cpus_read_lock();
cpumask_copy(downed_cpus, cpu_online_mask);
cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
if (num_online_cpus() > 1)
pr_notice("Disabling non-boot CPUs...\n");
- put_online_cpus();
+ cpus_read_unlock();
for_each_cpu(cpu, downed_cpus) {
err = remove_cpu(cpu);
@@ -400,7 +400,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
- if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
+ if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 5eb4dc2b97da..e8b061557887 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -254,7 +254,13 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
/* make sure all non-reserved blocks are inside the limits */
bi->start = max(bi->start, low);
- bi->end = min(bi->end, high);
+
+ /* preserve info for non-RAM areas above 'max_pfn': */
+ if (bi->end > high) {
+ numa_add_memblk_to(bi->nid, high, bi->end,
+ &numa_reserved_meminfo);
+ bi->end = high;
+ }
/* and there's no empty block */
if (bi->start >= bi->end)
@@ -349,7 +355,7 @@ void __init numa_reset_distance(void)
/* numa_distance could be 1LU marking allocation failure, test cnt */
if (numa_distance_cnt)
- memblock_free(__pa(numa_distance), size);
+ memblock_free(numa_distance, size);
numa_distance_cnt = 0;
numa_distance = NULL; /* enable table creation */
}
@@ -370,15 +376,14 @@ static int __init numa_alloc_distance(void)
cnt++;
size = cnt * cnt * sizeof(numa_distance[0]);
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
}
- memblock_reserve(phys, size);
numa_distance = __va(phys);
numa_distance_cnt = cnt;
@@ -733,17 +738,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static void __init init_memory_less_node(int nid)
-{
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_memoryless_node(nid);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
/*
* A node may exist which has one or more Generic Initiators but no CPUs and no
@@ -761,9 +755,18 @@ void __init init_gi_nodes(void)
{
int nid;
+ /*
+ * Exclude this node from
+ * bringup_nonboot_cpus
+ * cpu_up
+ * __try_online_node
+ * register_one_node
+ * because node_subsys is not initialized yet.
+ * TODO remove dependency on node_online
+ */
for_each_node_state(nid, N_GENERIC_INITIATOR)
if (!node_online(nid))
- init_memory_less_node(nid);
+ node_set_online(nid);
}
/*
@@ -793,8 +796,17 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
+ /*
+ * Exclude this node from
+ * bringup_nonboot_cpus
+ * cpu_up
+ * __try_online_node
+ * register_one_node
+ * because node_subsys is not initialized yet.
+ * TODO remove dependency on node_online
+ */
if (!node_online(node))
- init_memory_less_node(node);
+ node_set_online(node);
numa_set_node(cpu, node);
}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 87d77cc52f86..9a9305367fdd 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
- while (nodes_weight(physnode_mask)) {
+ while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
@@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
*/
- while (nodes_weight(physnode_mask)) {
+ while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
@@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
if (numa_dist_cnt) {
u64 phys;
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- phys_size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
- memblock_reserve(phys, phys_size);
phys_dist = __va(phys);
for (i = 0; i < numa_dist_cnt; i++)
@@ -518,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
}
/* free the copied physical distance table */
- if (phys_dist)
- memblock_free(__pa(phys_dist), phys_size);
+ memblock_free(phys_dist, phys_size);
return;
no_emu:
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index ca311aaa67b8..d5ef64ddd35e 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -101,7 +101,7 @@ int pat_debug_enable;
static int __init pat_debug_setup(char *str)
{
pat_debug_enable = 1;
- return 0;
+ return 1;
}
__setup("debugpat", pat_debug_setup);
@@ -583,7 +583,12 @@ int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
int err = 0;
start = sanitize_phys(start);
- end = sanitize_phys(end);
+
+ /*
+ * The end address passed into this function is exclusive, but
+ * sanitize_phys() expects an inclusive address.
+ */
+ end = sanitize_phys(end - 1) + 1;
if (start >= end) {
WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
start, end - 1, cattr_name(req_type));
@@ -695,7 +700,7 @@ int memtype_free(u64 start, u64 end)
/**
- * lookup_memtype - Looksup the memory type for a physical address
+ * lookup_memtype - Looks up the memory type for a physical address
* @paddr: physical address of which memory type needs to be looked up
*
* Only to be called when PAT is enabled
@@ -800,6 +805,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end)
memtype_free(start, end);
}
+#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
@@ -813,6 +819,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
+#endif
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..1abd5438f126 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,10 @@
#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
+#include <linux/cc_platform.h>
+#include <linux/set_memory.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -26,7 +30,8 @@
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
-#include <asm/set_memory.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
#include "../mm_internal.h"
@@ -91,6 +96,12 @@ static void split_page_count(int level)
return;
direct_pages_count[level]--;
+ if (system_state == SYSTEM_RUNNING) {
+ if (level == PG_LEVEL_2M)
+ count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+ else if (level == PG_LEVEL_1G)
+ count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+ }
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
@@ -627,17 +638,6 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
-/*
- * Lookup the page table entry for a virtual address in a given mm. Return a
- * pointer to the entry and the level of the mapping.
- */
-pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level)
-{
- return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
-}
-EXPORT_SYMBOL_GPL(lookup_address_in_mm);
-
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
@@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address)
* end up in this kind of memory, for instance.
*
* This could be optimized, but it is only intended to be
- * used at inititalization time, and keeping it
+ * used at initialization time, and keeping it
* unoptimized should increase the testing coverage for
* the more obscure platforms.
*/
@@ -1126,7 +1126,7 @@ static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(pud_pgtable(*pud)))
pud_clear(pud);
}
@@ -1170,7 +1170,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(pud_pgtable(*pud)))
pud_clear(pud);
}
@@ -1805,7 +1805,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
}
/*
- * _set_memory_prot is an internal helper for callers that have been passed
+ * __set_memory_prot is an internal helper for callers that have been passed
* a pgprot_t value from upper layers and a reservation has already been taken.
* If you want to set the pgprot to a specific page protocol, use the
* set_memory_xx() functions.
@@ -1914,6 +1914,51 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
+/* Prevent speculative access to a page by marking it not-present */
+#ifdef CONFIG_X86_64
+int set_mce_nospec(unsigned long pfn)
+{
+ unsigned long decoy_addr;
+ int rc;
+
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
+ /*
+ * We would like to just call:
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the poison page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_XX() properly sanitizing any __pa()
+ * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+ */
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+ rc = set_memory_np(decoy_addr, 1);
+ if (rc)
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+ return rc;
+}
+
+static int set_memory_present(unsigned long *addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+/* Restore full speculative operation to the pfn. */
+int clear_mce_nospec(unsigned long pfn)
+{
+ unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+ return set_memory_present(&addr, 1);
+}
+EXPORT_SYMBOL_GPL(clear_mce_nospec);
+#endif /* CONFIG_X86_64 */
+
int set_memory_x(unsigned long addr, int numpages)
{
if (!(__supported_pte_mask & _PAGE_NX))
@@ -1972,15 +2017,16 @@ int set_memory_global(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
-static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+/*
+ * __set_memory_enc_pgtable() is used for the hypervisors that get
+ * informed about "encryption" status via page tables.
+ */
+static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
+ pgprot_t empty = __pgprot(0);
struct cpa_data cpa;
int ret;
- /* Nothing to do if memory encryption is not active */
- if (!mem_encrypt_active())
- return 0;
-
/* Should not be working on unaligned addresses */
if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
addr &= PAGE_MASK;
@@ -1988,18 +2034,20 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
memset(&cpa, 0, sizeof(cpa));
cpa.vaddr = &addr;
cpa.numpages = numpages;
- cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
- cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+ cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
+ cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
cpa.pgd = init_mm.pgd;
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
vm_unmap_aliases();
- /*
- * Before changing the encryption attribute, we need to flush caches.
- */
- cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
+ /* Flush the caches as needed before changing the encryption attribute. */
+ if (x86_platform.guest.enc_tlb_flush_required(enc))
+ cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
+
+ /* Notify hypervisor that we are about to set/clr encryption attribute. */
+ x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2012,9 +2060,26 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
*/
cpa_flush(&cpa, 0);
+ /* Notify hypervisor that we have successfully set/clr encryption attribute. */
+ if (!ret) {
+ if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ ret = -EIO;
+ }
+
return ret;
}
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+ if (hv_is_isolation_supported())
+ return hv_set_mem_host_visibility(addr, numpages, !enc);
+
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ return __set_memory_enc_pgtable(addr, numpages, enc);
+
+ return 0;
+}
+
int set_memory_encrypted(unsigned long addr, int numpages)
{
return __set_memory_enc_dec(addr, numpages, true);
@@ -2093,12 +2158,6 @@ int set_pages_array_wc(struct page **pages, int numpages)
}
EXPORT_SYMBOL(set_pages_array_wc);
-int set_pages_array_wt(struct page **pages, int numpages)
-{
- return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT);
-}
-EXPORT_SYMBOL_GPL(set_pages_array_wt);
-
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c
new file mode 100644
index 000000000000..763742782286
--- /dev/null
+++ b/arch/x86/mm/pgprot.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+ unsigned long val = pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /*
+ * Take the 4 protection key bits out of the vma->vm_flags value and
+ * turn them in to the bits that we can put in to a pte.
+ *
+ * Only override these if Protection Keys are available (which is only
+ * on 64-bit).
+ */
+ if (vm_flags & VM_PKEY_BIT0)
+ val |= _PAGE_PKEY_BIT0;
+ if (vm_flags & VM_PKEY_BIT1)
+ val |= _PAGE_PKEY_BIT1;
+ if (vm_flags & VM_PKEY_BIT2)
+ val |= _PAGE_PKEY_BIT2;
+ if (vm_flags & VM_PKEY_BIT3)
+ val |= _PAGE_PKEY_BIT3;
+#endif
+
+ val = __sme_set(val);
+ if (val & _PAGE_PRESENT)
+ val &= __supported_pte_mask;
+ return __pgprot(val);
+}
+EXPORT_SYMBOL(vm_get_page_prot);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..a932d7712d85 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+
+pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ /*
+ * No flush is necessary. Once an invalid PTE is established, the PTE's
+ * access and dirty bits cannot be updated.
+ */
+ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
+}
#endif
/**
@@ -676,9 +686,8 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
*
* No 512GB pages yet -- always return 0
*/
-int p4d_clear_huge(p4d_t *p4d)
+void p4d_clear_huge(p4d_t *p4d)
{
- return 0;
}
#endif
@@ -780,14 +789,6 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
-/*
- * Until we support 512GB pages, skip them in the vmap area.
- */
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -805,7 +806,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
pte_t *pte;
int i;
- pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd = pud_pgtable(*pud);
pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
if (!pmd_sv)
return 0;
@@ -861,11 +862,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#else /* !CONFIG_X86_64 */
-int pud_free_pmd_page(pud_t *pud, unsigned long addr)
-{
- return pud_none(*pud);
-}
-
/*
* Disable free page handling on x86-PAE. This assures that ioremap()
* does not update sync'd pmd entries. See vmalloc_sync_one().
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 8873ed1438a9..e44e938885b7 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -10,7 +10,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/fpu/internal.h> /* init_fpstate */
int __execute_only_pkey(struct mm_struct *mm)
{
@@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
-/*
- * Called from the FPU code when creating a fresh set of FPU
- * registers. This is called from a very specific context where
- * we know the FPU regstiers are safe for use and we can use PKRU
- * directly.
- */
-void copy_init_pkru_to_fpregs(void)
-{
- u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
- /*
- * Override the PKRU state that came from 'init_fpstate'
- * with the baseline from the process.
- */
- write_pkru(init_pkru_value_snapshot);
-}
-
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -154,7 +137,6 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
static ssize_t init_pkru_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- struct pkru_state *pk;
char buf[32];
ssize_t len;
u32 new_init_pkru;
@@ -177,10 +159,6 @@ static ssize_t init_pkru_write_file(struct file *file,
return -EINVAL;
WRITE_ONCE(init_pkru_value, new_init_pkru);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (!pk)
- return -EINVAL;
- pk->pkru = new_init_pkru;
return count;
}
@@ -192,6 +170,10 @@ static const struct file_operations fops_init_pkru = {
static int __init create_init_pkru_value(void)
{
+ /* Do not expose the file if pkeys are not supported. */
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return 0;
+
debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_init_pkru);
return 0;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..ffe3b3a087fe 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
* global, so set it as global in both copies. Note:
* the X86_FEATURE_PGE check is not _required_ because
* the CPU ignores _PAGE_GLOBAL when PGE is not
- * supported. The check keeps consistentency with
+ * supported. The check keeps consistency with
* code that only set this bit when supported.
*/
if (boot_cpu_has(X86_FEATURE_PGE))
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.
@@ -512,7 +511,7 @@ static void pti_clone_entry_text(void)
static inline bool pti_kernel_image_global_ok(void)
{
/*
- * Systems with PCIDs get litlle benefit from global
+ * Systems with PCIDs get little benefit from global
* kernel text and are not worth the downsides.
*/
if (cpu_feature_enabled(X86_FEATURE_PCID))
@@ -541,7 +540,7 @@ static inline bool pti_kernel_image_global_ok(void)
* cases where RANDSTRUCT is in use to help keep the layout a
* secret.
*/
- if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ if (IS_ENABLED(CONFIG_RANDSTRUCT))
return false;
return true;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
deleted file mode 100644
index ed5667f5169f..000000000000
--- a/arch/x86/mm/setup_nx.c
+++ /dev/null
@@ -1,62 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/pgtable.h>
-
-#include <asm/proto.h>
-#include <asm/cpufeature.h>
-
-static int disable_nx;
-
-/*
- * noexec = on|off
- *
- * Control non-executable mappings for processes.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- }
- x86_configure_nx();
- return 0;
-}
-early_param("noexec", noexec_setup);
-
-void x86_configure_nx(void)
-{
- if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
- __supported_pte_mask |= _PAGE_NX;
- else
- __supported_pte_mask &= ~_PAGE_NX;
-}
-
-void __init x86_report_nx(void)
-{
- if (!boot_cpu_has(X86_FEATURE_NX)) {
- printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU!\n");
- } else {
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
- if (disable_nx) {
- printk(KERN_INFO "NX (Execute Disable) protection: "
- "disabled by kernel command line option\n");
- } else {
- printk(KERN_INFO "NX (Execute Disable) protection: "
- "active\n");
- }
-#else
- /* 32bit non-PAE kernel, NX cannot be used */
- printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "cannot be enabled: non-PAE kernel!\n");
-#endif
- }
-}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 569ac1d57f55..d400b6d9d246 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,12 +8,16 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/sched/smt.h>
+#include <linux/task_work.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
+#include <asm/cacheflush.h>
#include <asm/apic.h>
+#include <asm/perf_event.h>
#include "mm_internal.h"
@@ -24,7 +28,7 @@
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
-# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info)
#endif
/*
@@ -42,10 +46,15 @@
*/
/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_L1D_FLUSH 0x2UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)
+
+/* Bits to set when tlbstate and flush is (re)initialized */
+#define LAST_USER_MM_INIT LAST_USER_MM_IBPB
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -106,7 +115,7 @@ static inline u16 kern_pcid(u16 asid)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
- * Make sure that the dynamic ASID space does not confict with the
+ * Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
@@ -300,7 +309,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
+ WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -316,20 +325,70 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+/*
+ * Invoked from return to user/guest by a task that opted-in to L1D
+ * flushing but ended up running on an SMT enabled core due to wrong
+ * affinity settings or CPU hotplug. This is part of the paranoid L1D flush
+ * contract which this task requested.
+ */
+static void l1d_flush_force_sigbus(struct callback_head *ch)
+{
+ force_sig(SIGBUS);
+}
+
+static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
+ struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ /* Flush L1D if the outgoing task requests it */
+ if (prev_mm & LAST_USER_MM_L1D_FLUSH)
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+
+ /* Check whether the incoming task opted in for L1D flush */
+ if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
+ return;
- return (unsigned long)next->mm | ibpb;
+ /*
+ * Validate that it is not running on an SMT sibling as this would
+ * make the excercise pointless because the siblings share L1D. If
+ * it runs on a SMT sibling, notify it with SIGBUS on return to
+ * user/guest
+ */
+ if (this_cpu_read(cpu_info.smt_active)) {
+ clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH);
+ next->l1d_flush_kill.func = l1d_flush_force_sigbus;
+ task_work_add(next, &next->l1d_flush_kill, TWA_RESUME);
+ }
}
-static void cond_ibpb(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
+
+ /*
+ * Ensure that the bit shift above works as expected and the two flags
+ * end up in bit 0 and 1.
+ */
+ BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);
+
+ return (unsigned long)next->mm | spec_bits;
+}
+
+static void cond_mitigation(struct task_struct *next)
+{
+ unsigned long prev_mm, next_mm;
+
if (!next || !next->mm)
return;
+ next_mm = mm_mangle_tif_spec_bits(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
/*
+ * Avoid user/user BTB poisoning by flushing the branch predictor
+ * when switching between processes. This stops one process from
+ * doing Spectre-v2 attacks on another.
+ *
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
@@ -339,8 +398,6 @@ static void cond_ibpb(struct task_struct *next)
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -370,20 +427,14 @@ static void cond_ibpb(struct task_struct *next)
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -392,11 +443,22 @@ static void cond_ibpb(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
+ /*
+ * Flush L1D when the outgoing task requested it and/or
+ * check whether the incoming task requested L1D flushing
+ * and ended up on an SMT sibling.
+ */
+ if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH))
+ l1d_flush_evaluate(prev_mm, next_mm, next);
+ }
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}
#ifdef CONFIG_PERF_EVENTS
@@ -404,9 +466,14 @@ static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
(!static_branch_unlikely(&rdpmc_never_available_key) &&
- atomic_read(&mm->context.perf_rdpmc_allowed)))
+ atomic_read(&mm->context.perf_rdpmc_allowed))) {
+ /*
+ * Clear the existing dirty counters to
+ * prevent the leak for an RDPMC task.
+ */
+ perf_clear_dirty_counters();
cr4_set_bits_irqsoff(X86_CR4_PCE);
- else
+ } else
cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
@@ -424,7 +491,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
+ bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -439,7 +506,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- /* We don't want flush_tlb_func_* to run concurrently with us. */
+ /* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -469,7 +536,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
- this_cpu_write(cpu_tlbstate.is_lazy, false);
+ if (was_lazy)
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
/*
* The membarrier system call requires a full memory barrier and
@@ -490,7 +558,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
- * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+ * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
@@ -524,11 +592,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
need_flush = true;
} else {
/*
- * Avoid user/user BTB poisoning by flushing the branch
- * predictor when switching between processes. This stops
- * one process from doing Spectre-v2 attacks on another.
+ * Apply process to process speculation vulnerability
+ * mitigations if applicable.
*/
- cond_ibpb(tsk);
+ cond_mitigation(tsk);
/*
* Stop remote flushes for the previous mm.
@@ -598,7 +665,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- this_cpu_write(cpu_tlbstate.is_lazy, true);
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
/*
@@ -636,7 +703,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
@@ -647,14 +714,13 @@ void initialize_tlbstate_and_flush(void)
}
/*
- * flush_tlb_func_common()'s memory ordering requirement is that any
+ * flush_tlb_func()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
-static void flush_tlb_func_common(const struct flush_tlb_info *f,
- bool local, enum tlb_flush_reason reason)
+static void flush_tlb_func(void *info)
{
/*
* We have three different tlb_gen values in here. They are:
@@ -665,28 +731,40 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
+ const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ bool local = smp_processor_id() == f->initiating_cpu;
+ unsigned long nr_invalidate = 0;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (!local) {
+ inc_irq_stat(irq_tlb_count);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+
+ /* Can only happen on remote CPUs */
+ if (f->mm && f->mm != loaded_mm)
+ return;
+ }
+
if (unlikely(loaded_mm == &init_mm))
return;
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (this_cpu_read(cpu_tlbstate.is_lazy)) {
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
- * This should be rare, with native_flush_tlb_others skipping
+ * This should be rare, with native_flush_tlb_multi() skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
@@ -700,8 +778,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
- trace_tlb_flush(reason, 0);
- return;
+ goto done;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
@@ -736,7 +813,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* 3, we'd be break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
- * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
@@ -748,56 +825,52 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
+ nr_invalidate = (f->end - f->start) >> f->stride_shift;
+
while (addr < f->end) {
flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
- trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
+ nr_invalidate = TLB_FLUSH_ALL;
+
flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
-}
-static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
-{
- const struct flush_tlb_info *f = info;
-
- flush_tlb_func_common(f, true, reason);
-}
-
-static void flush_tlb_func_remote(void *info)
-{
- const struct flush_tlb_info *f = info;
-
- inc_irq_stat(irq_tlb_count);
-
- if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
+ /* Tracing is done in a unified manner to reduce the code size */
+done:
+ trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
+ (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
+ TLB_LOCAL_MM_SHOOTDOWN,
+ nr_invalidate);
}
static bool tlb_is_not_lazy(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+ return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}
-STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
+
+STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ /*
+ * Do accounting and tracing. Note that there are (and have always been)
+ * cases in which a remote TLB flush will be traced, but eventually
+ * would not happen.
+ */
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -816,17 +889,16 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
* doing a speculative memory access.
*/
if (info->freed_tables)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
+ on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
(void *)info, 1, cpumask);
}
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- __flush_tlb_others(cpumask, info);
+ __flush_tlb_multi(cpumask, info);
}
/*
@@ -847,7 +919,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
-static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
@@ -869,14 +941,15 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
+ info->initiating_cpu = smp_processor_id();
return info;
}
-static inline void put_flush_tlb_info(void)
+static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
- /* Complete reentrency prevention checks */
+ /* Complete reentrancy prevention checks */
barrier();
this_cpu_dec(flush_tlb_info_idx);
#endif
@@ -905,16 +978,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
- if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func_local() directly in this case.
+ */
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ flush_tlb_multi(mm_cpumask(mm), info);
+ } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), info);
-
put_flush_tlb_info();
put_cpu();
}
@@ -1045,7 +1122,7 @@ void flush_tlb_one_user(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_global(void)
{
- unsigned long cr4, flags;
+ unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
@@ -1065,11 +1142,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* toggle PGE */
- native_write_cr4(cr4 ^ X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
raw_local_irq_restore(flags);
}
@@ -1119,34 +1192,30 @@ void __flush_tlb_all(void)
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);
-/*
- * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
- * This means that the 'struct flush_tlb_info' that describes which mappings to
- * flush is actually fixed. We therefore set a single fixed struct and use it in
- * arch_tlbbatch_flush().
- */
-static const struct flush_tlb_info full_flush_tlb_info = {
- .mm = NULL,
- .start = 0,
- .end = TLB_FLUSH_ALL,
-};
-
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
+ struct flush_tlb_info *info;
+
int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func_local() directly in this case.
+ */
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ flush_tlb_multi(&batch->cpumask, info);
+ } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
-
cpumask_clear(&batch->cpumask);
+ put_flush_tlb_info();
put_cpu();
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b35fc8023884..f298b18a9a3d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * bpf_jit_comp.c: BPF JIT compiler
+ * BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
- * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/netdevice.h>
#include <linux/filter.h>
@@ -15,7 +15,6 @@
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
-#include <asm/asm-prototypes.h>
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -31,7 +30,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
}
#define EMIT(bytes, len) \
- do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+ do { prog = emit_code(prog, bytes, len); } while (0)
#define EMIT1(b1) EMIT(b1, 1)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
@@ -47,6 +46,12 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT4_off32(b1, b2, b3, b4, off) \
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+#ifdef CONFIG_X86_KERNEL_IBT
+#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#else
+#define EMIT_ENDBR()
+#endif
+
static bool is_imm8(int value)
{
return value <= 127 && value >= -128;
@@ -223,8 +228,21 @@ static void jit_fill_hole(void *area, unsigned int size)
memset(area, 0xcc, size);
}
+int bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
+}
+
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
+
+ /*
+ * Program specific offsets of labels in the code; these rely on the
+ * JIT doing at least 2 passes, recording the position on the first
+ * pass, only to generate the correct offset on the second pass.
+ */
+ int tail_call_direct_label;
+ int tail_call_indirect_label;
};
/* Maximum number of bytes emitted while JITing one eBPF insn */
@@ -234,12 +252,11 @@ struct jit_context {
/* Number of bytes emit_patch() needs to generate instructions */
#define X86_PATCH_SIZE 5
/* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET 11
+#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
- int cnt = 0;
if (callee_regs_used[0])
EMIT1(0x53); /* push rbx */
@@ -255,7 +272,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
- int cnt = 0;
if (callee_regs_used[3])
EMIT2(0x41, 0x5F); /* pop r15 */
@@ -277,13 +293,13 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
bool tail_call_reachable, bool is_subprog)
{
u8 *prog = *pprog;
- int cnt = X86_PATCH_SIZE;
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
- prog += cnt;
+ EMIT_ENDBR();
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
+ prog += X86_PATCH_SIZE;
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
EMIT2(0x31, 0xC0); /* xor eax, eax */
@@ -292,6 +308,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
}
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+
+ /* X86_TAIL_CALL_OFFSET is here */
+ EMIT_ENDBR();
+
/* sub rsp, rounded_stack_depth */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
@@ -303,7 +323,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{
u8 *prog = *pprog;
- int cnt = 0;
s64 offset;
offset = func - (ip + X86_PATCH_SIZE);
@@ -327,10 +346,9 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
}
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *old_addr, void *new_addr,
- const bool text_live)
+ void *old_addr, void *new_addr)
{
- const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
+ const u8 *nop_insn = x86_nops[5];
u8 old_insn[X86_PATCH_SIZE];
u8 new_insn[X86_PATCH_SIZE];
u8 *prog;
@@ -362,10 +380,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
- if (text_live)
- text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
- else
- memcpy(ip, new_insn, X86_PATCH_SIZE);
+ text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:
@@ -381,23 +396,34 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
/* BPF poking in modules is not supported */
return -EINVAL;
- return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
+ /*
+ * See emit_prologue(), for IBT builds the trampoline hook is preceded
+ * with an ENDBR instruction.
+ */
+ if (is_endbr(*(u32 *)ip))
+ ip += ENDBR_INSN_SIZE;
+
+ return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
}
-static int get_pop_bytes(bool *callee_regs_used)
+#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
+
+static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{
- int bytes = 0;
+ u8 *prog = *pprog;
- if (callee_regs_used[3])
- bytes += 2;
- if (callee_regs_used[2])
- bytes += 2;
- if (callee_regs_used[1])
- bytes += 2;
- if (callee_regs_used[0])
- bytes += 1;
+#ifdef CONFIG_RETPOLINE
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ EMIT_LFENCE();
+ EMIT2(0xFF, 0xE0 + reg);
+ } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
+ OPTIMIZER_HIDE_VAR(reg);
+ emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
+ } else
+#endif
+ EMIT2(0xFF, 0xE0 + reg);
- return bytes;
+ *pprog = prog;
}
/*
@@ -406,7 +432,7 @@ static int get_pop_bytes(bool *callee_regs_used)
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
- * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
@@ -415,30 +441,12 @@ static int get_pop_bytes(bool *callee_regs_used)
* out:
*/
static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
- u32 stack_depth)
+ u32 stack_depth, u8 *ip,
+ struct jit_context *ctx)
{
int tcc_off = -4 - round_up(stack_depth, 8);
- u8 *prog = *pprog;
- int pop_bytes = 0;
- int off1 = 42;
- int off2 = 31;
- int off3 = 9;
- int cnt = 0;
-
- /* count the additional bytes used for popping callee regs from stack
- * that need to be taken into account for each of the offsets that
- * are used for bailing out of the tail call
- */
- pop_bytes = get_pop_bytes(callee_regs_used);
- off1 += pop_bytes;
- off2 += pop_bytes;
- off3 += pop_bytes;
-
- if (stack_depth) {
- off1 += 7;
- off2 += 7;
- off3 += 7;
- }
+ u8 *prog = *pprog, *start = *pprog;
+ int offset;
/*
* rdi - pointer to ctx
@@ -453,17 +461,19 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
EMIT2(0x89, 0xD2); /* mov edx, edx */
EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */
- EMIT2(X86_JBE, OFFSET1); /* jbe out */
+
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JBE, offset); /* jbe out */
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
-#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE)
- EMIT2(X86_JA, OFFSET2); /* ja out */
+
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
@@ -476,12 +486,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
* goto out;
*/
EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */
-#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE)
- EMIT2(X86_JE, OFFSET3); /* je out */
- *pprog = prog;
- pop_callee_regs(pprog, callee_regs_used);
- prog = *pprog;
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JE, offset); /* je out */
+
+ pop_callee_regs(&prog, callee_regs_used);
EMIT1(0x58); /* pop rax */
if (stack_depth)
@@ -498,71 +507,52 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
* rdi == ctx (1st arg)
* rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
*/
- RETPOLINE_RCX_BPF_JIT();
+ emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));
/* out: */
+ ctx->tail_call_indirect_label = prog - start;
*pprog = prog;
}
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
- u8 **pprog, int addr, u8 *image,
- bool *callee_regs_used, u32 stack_depth)
+ u8 **pprog, u8 *ip,
+ bool *callee_regs_used, u32 stack_depth,
+ struct jit_context *ctx)
{
int tcc_off = -4 - round_up(stack_depth, 8);
- u8 *prog = *pprog;
- int pop_bytes = 0;
- int off1 = 20;
- int poke_off;
- int cnt = 0;
-
- /* count the additional bytes used for popping callee regs to stack
- * that need to be taken into account for jump offset that is used for
- * bailing out from of the tail call when limit is reached
- */
- pop_bytes = get_pop_bytes(callee_regs_used);
- off1 += pop_bytes;
-
- /*
- * total bytes for:
- * - nop5/ jmpq $off
- * - pop callee regs
- * - sub rsp, $val if depth > 0
- * - pop rax
- */
- poke_off = X86_PATCH_SIZE + pop_bytes + 1;
- if (stack_depth) {
- poke_off += 7;
- off1 += 7;
- }
+ u8 *prog = *pprog, *start = *pprog;
+ int offset;
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
- EMIT2(X86_JA, off1); /* ja out */
+
+ offset = ctx->tail_call_direct_label - (prog + 2 - start);
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
- poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE);
+ poke->tailcall_bypass = ip + (prog - start);
poke->adj_off = X86_TAIL_CALL_OFFSET;
- poke->tailcall_target = image + (addr - X86_PATCH_SIZE);
+ poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE;
poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE;
emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
poke->tailcall_bypass);
- *pprog = prog;
- pop_callee_regs(pprog, callee_regs_used);
- prog = *pprog;
+ pop_callee_regs(&prog, callee_regs_used);
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
- memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
+
/* out: */
+ ctx->tail_call_direct_label = prog - start;
*pprog = prog;
}
@@ -576,6 +566,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
for (i = 0; i < prog->aux->size_poke_tab; i++) {
poke = &prog->aux->poke_tab[i];
+ if (poke->aux && poke->aux != prog->aux)
+ continue;
+
WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
@@ -585,24 +578,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
mutex_lock(&array->aux->poke_mutex);
target = array->ptrs[poke->tail_call.key];
if (target) {
- /* Plain memcpy is used when image is not live yet
- * and still not locked as read-only. Once poke
- * location is active (poke->tailcall_target_stable),
- * any parallel bpf_arch_text_poke() might occur
- * still on the read-write image until we finally
- * locked it as read-only. Both modifications on
- * the given image are under text_mutex to avoid
- * interference.
- */
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP, NULL,
(u8 *)target->bpf_func +
- poke->adj_off, false);
+ poke->adj_off);
BUG_ON(ret < 0);
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
(u8 *)poke->tailcall_target +
- X86_PATCH_SIZE, NULL, false);
+ X86_PATCH_SIZE, NULL);
BUG_ON(ret < 0);
}
WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -615,7 +599,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
{
u8 *prog = *pprog;
u8 b1, b2, b3;
- int cnt = 0;
/*
* Optimization: if imm32 is positive, use 'mov %eax, imm32'
@@ -655,7 +638,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
const u32 imm32_hi, const u32 imm32_lo)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
/*
@@ -678,7 +660,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is64) {
/* mov dst, src */
@@ -697,7 +678,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is_imm8(off)) {
/* 1-byte signed displacement.
@@ -720,7 +700,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is64)
EMIT1(add_2mod(0x48, dst_reg, src_reg));
@@ -729,11 +708,24 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
*pprog = prog;
}
+/*
+ * Similar version of maybe_emit_mod() for a single register
+ */
+static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64)
+{
+ u8 *prog = *pprog;
+
+ if (is64)
+ EMIT1(add_1mod(0x48, reg));
+ else if (is_ereg(reg))
+ EMIT1(add_1mod(0x40, reg));
+ *pprog = prog;
+}
+
/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
switch (size) {
case BPF_B:
@@ -764,7 +756,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
switch (size) {
case BPF_B:
@@ -799,7 +790,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{
u8 *prog = *pprog;
- int cnt = 0;
EMIT1(0xF0); /* lock prefix */
@@ -808,7 +798,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
/* emit opcode */
switch (atomic_op) {
case BPF_ADD:
- case BPF_SUB:
case BPF_AND:
case BPF_OR:
case BPF_XOR:
@@ -838,9 +827,7 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
return 0;
}
-static bool ex_handler_bpf(const struct exception_table_entry *x,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code, unsigned long fault_addr)
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
u32 reg = x->fixup >> 8;
@@ -869,10 +856,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
-static int emit_nops(u8 **pprog, int len)
+static void emit_nops(u8 **pprog, int len)
{
u8 *prog = *pprog;
- int i, noplen, cnt = 0;
+ int i, noplen;
while (len > 0) {
noplen = len;
@@ -881,18 +868,16 @@ static int emit_nops(u8 **pprog, int len)
noplen = ASM_NOP_MAX;
for (i = 0; i < noplen; i++)
- EMIT1(ideal_nops[noplen][i]);
+ EMIT1(x86_nops[noplen][i]);
len -= noplen;
}
*pprog = prog;
-
- return cnt;
}
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -902,7 +887,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bool tail_call_seen = false;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
- int i, cnt = 0, excnt = 0;
+ int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
int err;
@@ -919,8 +904,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
push_callee_regs(&prog, callee_regs_used);
ilen = prog - temp;
- if (image)
- memcpy(image + proglen, temp, ilen);
+ if (rw_image)
+ memcpy(rw_image + proglen, temp, ilen);
proglen += ilen;
addrs[0] = proglen;
prog = temp;
@@ -964,10 +949,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
/* neg dst */
case BPF_ALU | BPF_NEG:
case BPF_ALU64 | BPF_NEG:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
EMIT2(0xF7, add_1reg(0xD8, dst_reg));
break;
@@ -981,10 +964,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_AND | BPF_K:
case BPF_ALU64 | BPF_OR | BPF_K:
case BPF_ALU64 | BPF_XOR | BPF_K:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
/*
* b3 holds 'normal' opcode, b2 short form only valid
@@ -1041,19 +1022,30 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_MOD | BPF_X:
case BPF_ALU64 | BPF_DIV | BPF_X:
case BPF_ALU64 | BPF_MOD | BPF_K:
- case BPF_ALU64 | BPF_DIV | BPF_K:
- EMIT1(0x50); /* push rax */
- EMIT1(0x52); /* push rdx */
+ case BPF_ALU64 | BPF_DIV | BPF_K: {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- if (BPF_SRC(insn->code) == BPF_X)
- /* mov r11, src_reg */
- EMIT_mov(AUX_REG, src_reg);
- else
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x50); /* push rax */
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x52); /* push rdx */
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (src_reg == BPF_REG_0 ||
+ src_reg == BPF_REG_3) {
+ /* mov r11, src_reg */
+ EMIT_mov(AUX_REG, src_reg);
+ src_reg = AUX_REG;
+ }
+ } else {
/* mov r11, imm32 */
EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
+ src_reg = AUX_REG;
+ }
- /* mov rax, dst_reg */
- EMIT_mov(BPF_REG_0, dst_reg);
+ if (dst_reg != BPF_REG_0)
+ /* mov rax, dst_reg */
+ emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);
/*
* xor edx, edx
@@ -1061,63 +1053,51 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
*/
EMIT2(0x31, 0xd2);
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- /* div r11 */
- EMIT3(0x49, 0xF7, 0xF3);
- else
- /* div r11d */
- EMIT3(0x41, 0xF7, 0xF3);
-
- if (BPF_OP(insn->code) == BPF_MOD)
- /* mov r11, rdx */
- EMIT3(0x49, 0x89, 0xD3);
- else
- /* mov r11, rax */
- EMIT3(0x49, 0x89, 0xC3);
+ /* div src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF0, src_reg));
- EMIT1(0x5A); /* pop rdx */
- EMIT1(0x58); /* pop rax */
+ if (BPF_OP(insn->code) == BPF_MOD &&
+ dst_reg != BPF_REG_3)
+ /* mov dst_reg, rdx */
+ emit_mov_reg(&prog, is64, dst_reg, BPF_REG_3);
+ else if (BPF_OP(insn->code) == BPF_DIV &&
+ dst_reg != BPF_REG_0)
+ /* mov dst_reg, rax */
+ emit_mov_reg(&prog, is64, dst_reg, BPF_REG_0);
- /* mov dst_reg, r11 */
- EMIT_mov(dst_reg, AUX_REG);
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x5A); /* pop rdx */
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x58); /* pop rax */
break;
+ }
case BPF_ALU | BPF_MUL | BPF_K:
- case BPF_ALU | BPF_MUL | BPF_X:
case BPF_ALU64 | BPF_MUL | BPF_K:
- case BPF_ALU64 | BPF_MUL | BPF_X:
- {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-
- if (dst_reg != BPF_REG_0)
- EMIT1(0x50); /* push rax */
- if (dst_reg != BPF_REG_3)
- EMIT1(0x52); /* push rdx */
-
- /* mov r11, dst_reg */
- EMIT_mov(AUX_REG, dst_reg);
+ maybe_emit_mod(&prog, dst_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
- if (BPF_SRC(insn->code) == BPF_X)
- emit_mov_reg(&prog, is64, BPF_REG_0, src_reg);
+ if (is_imm8(imm32))
+ /* imul dst_reg, dst_reg, imm8 */
+ EMIT3(0x6B, add_2reg(0xC0, dst_reg, dst_reg),
+ imm32);
else
- emit_mov_imm32(&prog, is64, BPF_REG_0, imm32);
+ /* imul dst_reg, dst_reg, imm32 */
+ EMIT2_off32(0x69,
+ add_2reg(0xC0, dst_reg, dst_reg),
+ imm32);
+ break;
- if (is64)
- EMIT1(add_1mod(0x48, AUX_REG));
- else if (is_ereg(AUX_REG))
- EMIT1(add_1mod(0x40, AUX_REG));
- /* mul(q) r11 */
- EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ maybe_emit_mod(&prog, src_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
- if (dst_reg != BPF_REG_3)
- EMIT1(0x5A); /* pop rdx */
- if (dst_reg != BPF_REG_0) {
- /* mov dst_reg, rax */
- EMIT_mov(dst_reg, BPF_REG_0);
- EMIT1(0x58); /* pop rax */
- }
+ /* imul dst_reg, src_reg */
+ EMIT3(0x0F, 0xAF, add_2reg(0xC0, src_reg, dst_reg));
break;
- }
+
/* Shifts */
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_K:
@@ -1125,10 +1105,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_LSH | BPF_K:
case BPF_ALU64 | BPF_RSH | BPF_K:
case BPF_ALU64 | BPF_ARSH | BPF_K:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
if (imm32 == 1)
@@ -1159,10 +1137,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
/* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
@@ -1232,6 +1208,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
break;
+ /* speculation barrier */
+ case BPF_ST | BPF_NOSPEC:
+ if (boot_cpu_has(X86_FEATURE_XMM2))
+ EMIT_LFENCE();
+ break;
+
/* ST: *(u8*)(dst_reg + off) = imm */
case BPF_ST | BPF_MEM | BPF_B:
if (is_ereg(dst_reg))
@@ -1280,24 +1262,59 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+ /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+ * add abs(insn->off) to the limit to make sure that negative
+ * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
- u8 *_insn = image + proglen;
+ u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
/* populate jmp_offset for JMP above */
@@ -1317,14 +1334,12 @@ st: if (is_imm8(insn->off))
pr_err("extable->insn doesn't fit into 32-bit\n");
return -EFAULT;
}
+ /* switch ex to rw buffer for writes */
+ ex = (void *)rw_image + ((void *)ex - (void *)image);
+
ex->insn = delta;
- delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
- if (!is_simm32(delta)) {
- pr_err("extable->handler doesn't fit into 32-bit\n");
- return -EFAULT;
- }
- ex->handler = delta;
+ ex->data = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
pr_err("verifier error\n");
@@ -1338,7 +1353,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
@@ -1347,9 +1362,10 @@ st: if (is_imm8(insn->off))
if (insn->imm == (BPF_AND | BPF_FETCH) ||
insn->imm == (BPF_OR | BPF_FETCH) ||
insn->imm == (BPF_XOR | BPF_FETCH)) {
- u8 *branch_target;
bool is64 = BPF_SIZE(insn->code) == BPF_DW;
u32 real_src_reg = src_reg;
+ u32 real_dst_reg = dst_reg;
+ u8 *branch_target;
/*
* Can't be implemented with a single x86 insn.
@@ -1360,11 +1376,13 @@ st: if (is_imm8(insn->off))
emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
if (src_reg == BPF_REG_0)
real_src_reg = BPF_REG_AX;
+ if (dst_reg == BPF_REG_0)
+ real_dst_reg = BPF_REG_AX;
branch_target = prog;
/* Load old value */
emit_ldx(&prog, BPF_SIZE(insn->code),
- BPF_REG_0, dst_reg, insn->off);
+ BPF_REG_0, real_dst_reg, insn->off);
/*
* Perform the (commutative) operation locally,
* put the result in the AUX_REG.
@@ -1375,7 +1393,8 @@ st: if (is_imm8(insn->off))
add_2reg(0xC0, AUX_REG, real_src_reg));
/* Attempt to swap in new value */
err = emit_atomic(&prog, BPF_CMPXCHG,
- dst_reg, AUX_REG, insn->off,
+ real_dst_reg, AUX_REG,
+ insn->off,
BPF_SIZE(insn->code));
if (WARN_ON(err))
return err;
@@ -1389,11 +1408,10 @@ st: if (is_imm8(insn->off))
/* Restore R0 after clobbering RAX */
emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
break;
-
}
err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
- insn->off, BPF_SIZE(insn->code));
+ insn->off, BPF_SIZE(insn->code));
if (err)
return err;
break;
@@ -1415,13 +1433,16 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
- &prog, addrs[i], image,
+ &prog, image + addrs[i - 1],
callee_regs_used,
- bpf_prog->aux->stack_depth);
+ bpf_prog->aux->stack_depth,
+ ctx);
else
emit_bpf_tail_call_indirect(&prog,
callee_regs_used,
- bpf_prog->aux->stack_depth);
+ bpf_prog->aux->stack_depth,
+ image + addrs[i - 1],
+ ctx);
break;
/* cond jump */
@@ -1462,10 +1483,8 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K:
/* test dst_reg, imm32 */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
goto emit_cond_jmp;
@@ -1498,10 +1517,8 @@ st: if (is_imm8(insn->off))
}
/* cmp dst_reg, imm8/32 */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
if (is_imm8(imm32))
EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
@@ -1556,7 +1573,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
if (is_imm8(jmp_offset)) {
if (jmp_padding) {
/* To keep the jmp_offset valid, the extra bytes are
- * padded before the jump insn, so we substract the
+ * padded before the jump insn, so we subtract the
* 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
*
* If the previous pass already emits an imm8
@@ -1576,7 +1593,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, nops);
+ emit_nops(&prog, nops);
}
EMIT2(jmp_cond, jmp_offset);
} else if (is_simm32(jmp_offset)) {
@@ -1622,7 +1639,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, nops);
+ emit_nops(&prog, nops);
}
break;
}
@@ -1631,7 +1648,7 @@ emit_jmp:
if (jmp_padding) {
/* To avoid breaking jmp_offset, the extra bytes
* are padded before the actual jmp insn, so
- * 2 bytes is substracted from INSN_SZ_DIFF.
+ * 2 bytes is subtracted from INSN_SZ_DIFF.
*
* If the previous pass already emits an imm8
* jmp, there is nothing to pad (0 byte).
@@ -1647,7 +1664,7 @@ emit_jmp:
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
+ emit_nops(&prog, INSN_SZ_DIFF - 2);
}
EMIT2(0xEB, jmp_offset);
} else if (is_simm32(jmp_offset)) {
@@ -1689,11 +1706,20 @@ emit_jmp:
}
if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
- memcpy(image + proglen, temp, ilen);
+ memcpy(rw_image + proglen, temp, ilen);
}
proglen += ilen;
addrs[i] = proglen;
@@ -1741,14 +1767,32 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
}
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
- struct bpf_prog *p, int stack_size, bool mod_ret)
+ struct bpf_tramp_link *l, int stack_size,
+ int run_ctx_off, bool save_ret)
{
u8 *prog = *pprog;
u8 *jmp_insn;
- int cnt = 0;
+ int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
+ struct bpf_prog *p = l->link.prog;
+ u64 cookie = l->cookie;
+
+ /* mov rdi, cookie */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);
+
+ /* Prepare struct bpf_tramp_run_ctx.
+ *
+ * bpf_tramp_run_ctx is already preserved by
+ * arch_prepare_bpf_trampoline().
+ *
+ * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi
+ */
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ /* arg2: lea rsi, [rbp - ctx_cookie_off] */
+ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
+
if (emit_call(&prog,
p->aux->sleepable ? __bpf_prog_enter_sleepable :
__bpf_prog_enter, prog))
@@ -1775,11 +1819,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (emit_call(&prog, p->bpf_func, prog))
return -EINVAL;
- /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
+ /*
+ * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
* of the previous call which is then passed on the stack to
* the next BPF program.
+ *
+ * BPF_TRAMP_FENTRY trampoline may need to return the return
+ * value of BPF_PROG_TYPE_STRUCT_OPS prog.
*/
- if (mod_ret)
+ if (save_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
/* replace 2 nops with JE insn, since jmp target is known */
@@ -1790,6 +1838,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
/* arg2: mov rsi, rbx <- start time in nsec */
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ /* arg3: lea rdx, [rbp - run_ctx_off] */
+ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
if (emit_call(&prog,
p->aux->sleepable ? __bpf_prog_exit_sleepable :
__bpf_prog_exit, prog))
@@ -1813,7 +1863,6 @@ static void emit_align(u8 **pprog, u32 align)
static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
{
u8 *prog = *pprog;
- int cnt = 0;
s64 offset;
offset = func - (ip + 2 + 4);
@@ -1827,13 +1876,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
}
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
- struct bpf_tramp_progs *tp, int stack_size)
+ struct bpf_tramp_links *tl, int stack_size,
+ int run_ctx_off, bool save_ret)
{
int i;
u8 *prog = *pprog;
- for (i = 0; i < tp->nr_progs; i++) {
- if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false))
+ for (i = 0; i < tl->nr_links; i++) {
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
+ run_ctx_off, save_ret))
return -EINVAL;
}
*pprog = prog;
@@ -1841,19 +1892,19 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
}
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
- struct bpf_tramp_progs *tp, int stack_size,
- u8 **branches)
+ struct bpf_tramp_links *tl, int stack_size,
+ int run_ctx_off, u8 **branches)
{
u8 *prog = *pprog;
- int i, cnt = 0;
+ int i;
/* The first fmod_ret program will receive a garbage return value.
* Set this to 0 to avoid confusing the program.
*/
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- for (i = 0; i < tp->nr_progs; i++) {
- if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true))
+ for (i = 0; i < tl->nr_links; i++) {
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
return -EINVAL;
/* mod_ret prog stored return value into [rbp - 8]. Emit:
@@ -1876,6 +1927,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
return 0;
}
+static bool is_valid_bpf_tramp_flags(unsigned int flags)
+{
+ if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
+ (flags & BPF_TRAMP_F_SKIP_FRAME))
+ return false;
+
+ /*
+ * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
+ * and it must be used alone.
+ */
+ if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) &&
+ (flags & ~BPF_TRAMP_F_RET_FENTRY_RET))
+ return false;
+
+ return true;
+}
+
/* Example:
* __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
* its 'struct btf_func_model' will be nr_args=2
@@ -1938,42 +2006,99 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
*/
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_progs *tprogs,
+ struct bpf_tramp_links *tlinks,
void *orig_call)
{
- int ret, i, cnt = 0, nr_args = m->nr_args;
- int stack_size = nr_args * 8;
- struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
- struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
- struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
+ int ret, i, nr_args = m->nr_args;
+ int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
+ struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
+ struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+ struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
u8 **branches = NULL;
u8 *prog;
+ bool save_ret;
/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
if (nr_args > 6)
return -ENOTSUPP;
- if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
- (flags & BPF_TRAMP_F_SKIP_FRAME))
+ if (!is_valid_bpf_tramp_flags(flags))
return -EINVAL;
- if (flags & BPF_TRAMP_F_CALL_ORIG)
- stack_size += 8; /* room for return value of orig_call */
+ /* Generated trampoline stack layout:
+ *
+ * RBP + 8 [ return address ]
+ * RBP + 0 [ RBP ]
+ *
+ * RBP - 8 [ return value ] BPF_TRAMP_F_CALL_ORIG or
+ * BPF_TRAMP_F_RET_FENTRY_RET flags
+ *
+ * [ reg_argN ] always
+ * [ ... ]
+ * RBP - regs_off [ reg_arg1 ] program's ctx pointer
+ *
+ * RBP - args_off [ args count ] always
+ *
+ * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
+ *
+ * RBP - run_ctx_off [ bpf_tramp_run_ctx ]
+ */
- if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ /* room for return value of orig_call or fentry prog */
+ save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
+ if (save_ret)
+ stack_size += 8;
+
+ regs_off = stack_size;
+
+ /* args count */
+ stack_size += 8;
+ args_off = stack_size;
+
+ if (flags & BPF_TRAMP_F_IP_ARG)
+ stack_size += 8; /* room for IP address argument */
+
+ ip_off = stack_size;
+
+ stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
+ run_ctx_off = stack_size;
+
+ if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
*/
+ if (is_endbr(*(u32 *)orig_call))
+ orig_call += ENDBR_INSN_SIZE;
orig_call += X86_PATCH_SIZE;
+ }
prog = image;
+ EMIT_ENDBR();
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
- save_regs(m, &prog, nr_args, stack_size);
+ /* Store number of arguments of the traced function:
+ * mov rax, nr_args
+ * mov QWORD PTR [rbp - args_off], rax
+ */
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+
+ if (flags & BPF_TRAMP_F_IP_ARG) {
+ /* Store IP address of the traced function:
+ * mov rax, QWORD PTR [rbp + 8]
+ * sub rax, X86_PATCH_SIZE
+ * mov QWORD PTR [rbp - ip_off], rax
+ */
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
+ EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
+ }
+
+ save_regs(m, &prog, nr_args, regs_off);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -1984,25 +2109,26 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
}
- if (fentry->nr_progs)
- if (invoke_bpf(m, &prog, fentry, stack_size))
+ if (fentry->nr_links)
+ if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
+ flags & BPF_TRAMP_F_RET_FENTRY_RET))
return -EINVAL;
- if (fmod_ret->nr_progs) {
- branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *),
+ if (fmod_ret->nr_links) {
+ branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
GFP_KERNEL);
if (!branches)
return -ENOMEM;
- if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size,
- branches)) {
+ if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
+ run_ctx_off, branches)) {
ret = -EINVAL;
goto cleanup;
}
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* call original function */
if (emit_call(&prog, orig_call, prog)) {
@@ -2012,11 +2138,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
im->ip_after_call = prog;
- memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
}
- if (fmod_ret->nr_progs) {
+ if (fmod_ret->nr_links) {
/* From Intel 64 and IA-32 Architectures Optimization
* Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
* Coding Rule 11: All branch targets should be 16-byte
@@ -2026,19 +2152,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
/* Update the branches saved in invoke_bpf_mod_ret with the
* aligned address of do_fexit.
*/
- for (i = 0; i < fmod_ret->nr_progs; i++)
+ for (i = 0; i < fmod_ret->nr_links; i++)
emit_cond_near_jump(&branches[i], prog, branches[i],
X86_JNE);
}
- if (fexit->nr_progs)
- if (invoke_bpf(m, &prog, fexit, stack_size)) {
+ if (fexit->nr_links)
+ if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
ret = -EINVAL;
goto cleanup;
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
@@ -2052,9 +2178,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
ret = -EINVAL;
goto cleanup;
}
- /* restore original return value back into RAX */
- emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
}
+ /* restore return value of orig_call or fentry prog back into RAX */
+ if (save_ret)
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
EMIT1(0x5B); /* pop rbx */
EMIT1(0xC9); /* leave */
@@ -2074,30 +2201,10 @@ cleanup:
return ret;
}
-static int emit_fallback_jump(u8 **pprog)
-{
- u8 *prog = *pprog;
- int err = 0;
-
-#ifdef CONFIG_RETPOLINE
- /* Note that this assumes the the compiler uses external
- * thunks for indirect calls. Both clang and GCC use the same
- * naming convention for external thunks.
- */
- err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
-#else
- int cnt = 0;
-
- EMIT2(0xFF, 0xE2); /* jmp rdx */
-#endif
- *pprog = prog;
- return err;
-}
-
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
{
u8 *jg_reloc, *prog = *pprog;
- int pivot, err, jg_bytes = 1, cnt = 0;
+ int pivot, err, jg_bytes = 1;
s64 jg_offset;
if (a == b) {
@@ -2115,9 +2222,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
if (err)
return err;
- err = emit_fallback_jump(&prog); /* jmp thunk/indirect */
- if (err)
- return err;
+ emit_indirect_jump(&prog, 2 /* rdx */, prog);
*pprog = prog;
return 0;
@@ -2185,6 +2290,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
}
struct x64_jit_data {
+ struct bpf_binary_header *rw_header;
struct bpf_binary_header *header;
int *addrs;
u8 *image;
@@ -2197,6 +2303,7 @@ struct x64_jit_data {
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
+ struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
@@ -2205,6 +2312,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bool tmp_blinded = false;
bool extra_pass = false;
bool padding = false;
+ u8 *rw_image = NULL;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2240,6 +2348,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
+ rw_header = jit_data->rw_header;
+ rw_image = (void *)rw_header + ((void *)image - (void *)header);
extra_pass = true;
padding = true;
goto skip_init_addrs;
@@ -2270,13 +2380,22 @@ skip_init_addrs:
for (pass = 0; pass < MAX_PASSES || image; pass++) {
if (!padding && pass >= PADDING_PASSES)
padding = true;
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
+ proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
- if (header)
- bpf_jit_binary_free(header);
+ if (header) {
+ bpf_arch_text_copy(&header->size, &rw_header->size,
+ sizeof(rw_header->size));
+ bpf_jit_binary_pack_free(header, rw_header);
+ }
+ /* Fall back to interpreter mode */
prog = orig_prog;
+ if (extra_pass) {
+ prog->bpf_func = NULL;
+ prog->jited = 0;
+ prog->jited_len = 0;
+ }
goto out_addrs;
}
if (image) {
@@ -2299,8 +2418,9 @@ out_image:
sizeof(struct exception_table_entry);
/* allocate module memory for x86 insns and extable */
- header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
- &image, align, jit_fill_hole);
+ header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
+ &image, align, &rw_header, &rw_image,
+ jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
@@ -2316,14 +2436,27 @@ out_image:
if (image) {
if (!prog->is_func || extra_pass) {
+ /*
+ * bpf_jit_binary_pack_finalize fails in two scenarios:
+ * 1) header is not pointing to proper module memory;
+ * 2) the arch doesn't support bpf_arch_text_copy().
+ *
+ * Both cases are serious bugs and justify WARN_ON.
+ */
+ if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) {
+ /* header has been freed */
+ header = NULL;
+ goto out_image;
+ }
+
bpf_tail_call_direct_fixup(prog);
- bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
+ jit_data->rw_header = rw_header;
}
prog->bpf_func = (void *)image;
prog->jited = 1;
@@ -2346,3 +2479,15 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
+
+void *bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ if (text_poke_copy(dst, src, len) == NULL)
+ return ERR_PTR(-EINVAL);
+ return dst;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index d17b67c69f89..429a89c5468b 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -15,6 +15,7 @@
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
+#include <asm/asm-prototypes.h>
#include <linux/bpf.h>
/*
@@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
*pprog = prog;
}
+static int emit_jmp_edx(u8 **pprog, u8 *ip)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+#ifdef CONFIG_RETPOLINE
+ EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5));
+#else
+ EMIT2(0xFF, 0xE2);
+#endif
+ *pprog = prog;
+
+ return cnt;
+}
+
/*
* Generate the following code:
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
@@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
-static void emit_bpf_tail_call(u8 **pprog)
+static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
{
u8 *prog = *pprog;
int cnt = 0;
@@ -1307,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
lo = (u32)MAX_TAIL_CALL_CNT;
@@ -1321,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog)
/* cmp ecx,lo */
EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
- /* ja out */
+ /* jae out */
EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
/* add eax,0x1 */
@@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog)
* eax == ctx (1st arg)
* edx == prog->bpf_func + prologue_size
*/
- RETPOLINE_EDX_BPF_JIT();
+ cnt += emit_jmp_edx(&prog, ip + cnt);
if (jmp_label1 == -1)
jmp_label1 = cnt;
@@ -1390,6 +1406,19 @@ static inline void emit_push_r64(const u8 src[], u8 **pprog)
*pprog = prog;
}
+static void emit_push_r32(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
{
u8 jmp_cond;
@@ -1459,6 +1488,174 @@ static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
return jmp_cond;
}
+/* i386 kernel compiles with "-mregparm=3". From gcc document:
+ *
+ * ==== snippet ====
+ * regparm (number)
+ * On x86-32 targets, the regparm attribute causes the compiler
+ * to pass arguments number one to (number) if they are of integral
+ * type in registers EAX, EDX, and ECX instead of on the stack.
+ * Functions that take a variable number of arguments continue
+ * to be passed all of their arguments on the stack.
+ * ==== snippet ====
+ *
+ * The first three args of a function will be considered for
+ * putting into the 32bit register EAX, EDX, and ECX.
+ *
+ * Two 32bit registers are used to pass a 64bit arg.
+ *
+ * For example,
+ * void foo(u32 a, u32 b, u32 c, u32 d):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u32 c: ECX
+ * u32 d: stack
+ *
+ * void foo(u64 a, u32 b, u32 c):
+ * u64 a: EAX (lo32) EDX (hi32)
+ * u32 b: ECX
+ * u32 c: stack
+ *
+ * void foo(u32 a, u64 b, u32 c):
+ * u32 a: EAX
+ * u64 b: EDX (lo32) ECX (hi32)
+ * u32 c: stack
+ *
+ * void foo(u32 a, u32 b, u64 c):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u64 c: stack
+ *
+ * The return value will be stored in the EAX (and EDX for 64bit value).
+ *
+ * For example,
+ * u32 foo(u32 a, u32 b, u32 c):
+ * return value: EAX
+ *
+ * u64 foo(u32 a, u32 b, u32 c):
+ * return value: EAX (lo32) EDX (hi32)
+ *
+ * Notes:
+ * The verifier only accepts function having integer and pointers
+ * as its args and return value, so it does not have
+ * struct-by-value.
+ *
+ * emit_kfunc_call() finds out the btf_func_model by calling
+ * bpf_jit_find_kfunc_model(). A btf_func_model
+ * has the details about the number of args, size of each arg,
+ * and the size of the return value.
+ *
+ * It first decides how many args can be passed by EAX, EDX, and ECX.
+ * That will decide what args should be pushed to the stack:
+ * [first_stack_regno, last_stack_regno] are the bpf regnos
+ * that should be pushed to the stack.
+ *
+ * It will first push all args to the stack because the push
+ * will need to use ECX. Then, it moves
+ * [BPF_REG_1, first_stack_regno) to EAX, EDX, and ECX.
+ *
+ * When emitting a call (0xE8), it needs to figure out
+ * the jmp_offset relative to the jit-insn address immediately
+ * following the call (0xE8) instruction. At this point, it knows
+ * the end of the jit-insn address after completely translated the
+ * current (BPF_JMP | BPF_CALL) bpf-insn. It is passed as "end_addr"
+ * to the emit_kfunc_call(). Thus, it can learn the "immediate-follow-call"
+ * address by figuring out how many jit-insn is generated between
+ * the call (0xE8) and the end_addr:
+ * - 0-1 jit-insn (3 bytes each) to restore the esp pointer if there
+ * is arg pushed to the stack.
+ * - 0-2 jit-insns (3 bytes each) to handle the return value.
+ */
+static int emit_kfunc_call(const struct bpf_prog *bpf_prog, u8 *end_addr,
+ const struct bpf_insn *insn, u8 **pprog)
+{
+ const u8 arg_regs[] = { IA32_EAX, IA32_EDX, IA32_ECX };
+ int i, cnt = 0, first_stack_regno, last_stack_regno;
+ int free_arg_regs = ARRAY_SIZE(arg_regs);
+ const struct btf_func_model *fm;
+ int bytes_in_stack = 0;
+ const u8 *cur_arg_reg;
+ u8 *prog = *pprog;
+ s64 jmp_offset;
+
+ fm = bpf_jit_find_kfunc_model(bpf_prog, insn);
+ if (!fm)
+ return -EINVAL;
+
+ first_stack_regno = BPF_REG_1;
+ for (i = 0; i < fm->nr_args; i++) {
+ int regs_needed = fm->arg_size[i] > sizeof(u32) ? 2 : 1;
+
+ if (regs_needed > free_arg_regs)
+ break;
+
+ free_arg_regs -= regs_needed;
+ first_stack_regno++;
+ }
+
+ /* Push the args to the stack */
+ last_stack_regno = BPF_REG_0 + fm->nr_args;
+ for (i = last_stack_regno; i >= first_stack_regno; i--) {
+ if (fm->arg_size[i - 1] > sizeof(u32)) {
+ emit_push_r64(bpf2ia32[i], &prog);
+ bytes_in_stack += 8;
+ } else {
+ emit_push_r32(bpf2ia32[i], &prog);
+ bytes_in_stack += 4;
+ }
+ }
+
+ cur_arg_reg = &arg_regs[0];
+ for (i = BPF_REG_1; i < first_stack_regno; i++) {
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][0]));
+ if (fm->arg_size[i - 1] > sizeof(u32))
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][1]));
+ }
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],edx */
+ if (fm->ret_size > sizeof(u32))
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],eax */
+ if (fm->ret_size)
+ end_addr -= 3;
+
+ jmp_offset = (u8 *)__bpf_call_base + insn->imm - end_addr;
+ if (!is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF kernel function jmp_offset:%lld\n",
+ jmp_offset);
+ return -EINVAL;
+ }
+
+ EMIT1_off32(0xE8, jmp_offset);
+
+ if (fm->ret_size)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][0]));
+
+ if (fm->ret_size > sizeof(u32))
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][1]));
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), bytes_in_stack);
+
+ *pprog = prog;
+
+ return 0;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -1705,6 +1902,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
i++;
break;
}
+ /* speculation barrier */
+ case BPF_ST | BPF_NOSPEC:
+ if (boot_cpu_has(X86_FEATURE_XMM2))
+ /* Emit 'lfence' */
+ EMIT3(0x0F, 0xAE, 0xE8);
+ break;
/* ST: *(u8*)(dst_reg + off) = imm */
case BPF_ST | BPF_MEM | BPF_H:
case BPF_ST | BPF_MEM | BPF_B:
@@ -1888,6 +2091,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (insn->src_reg == BPF_PSEUDO_CALL)
goto notyet;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ int err;
+
+ err = emit_kfunc_call(bpf_prog,
+ image + addrs[i],
+ insn, &prog);
+
+ if (err)
+ return err;
+ break;
+ }
+
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
@@ -1923,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
break;
}
case BPF_JMP | BPF_TAIL_CALL:
- emit_bpf_tail_call(&prog);
+ emit_bpf_tail_call(&prog, image + addrs[i - 1]);
break;
/* cond jump */
@@ -2276,7 +2491,16 @@ notyet:
}
if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
@@ -2393,3 +2617,8 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 948656069cdd..2f82480fd430 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -19,8 +19,9 @@ struct pci_root_info {
#endif
};
+bool pci_use_e820 = true;
static bool pci_use_crs = true;
-static bool pci_ignore_seg = false;
+static bool pci_ignore_seg;
static int __init set_use_crs(const struct dmi_system_id *id)
{
@@ -41,6 +42,14 @@ static int __init set_ignore_seg(const struct dmi_system_id *id)
return 0;
}
+static int __init set_no_e820(const struct dmi_system_id *id)
+{
+ printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n",
+ id->ident);
+ pci_use_e820 = false;
+ return 0;
+}
+
static const struct dmi_system_id pci_crs_quirks[] __initconst = {
/* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
{
@@ -135,6 +144,51 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"),
},
},
+
+ /*
+ * Many Lenovo models with "IIL" in their DMI_PRODUCT_VERSION have
+ * an E820 reserved region that covers the entire 32-bit host
+ * bridge memory window from _CRS. Using the E820 region to clip
+ * _CRS means no space is available for hot-added or uninitialized
+ * PCI devices. This typically breaks I2C controllers for touchpads
+ * and hot-added Thunderbolt devices. See the commit log for
+ * models known to require this quirk and related bug reports.
+ */
+ {
+ .callback = set_no_e820,
+ .ident = "Lenovo *IIL* product version",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "IIL"),
+ },
+ },
+
+ /*
+ * The Acer Spin 5 (SP513-54N) has the same E820 reservation covering
+ * the entire _CRS 32-bit window issue as the Lenovo *IIL* models.
+ * See https://bugs.launchpad.net/bugs/1884232
+ */
+ {
+ .callback = set_no_e820,
+ .ident = "Acer Spin 5 (SP513-54N)",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Spin SP513-54N"),
+ },
+ },
+
+ /*
+ * Clevo X170KM-G barebones have the same E820 reservation covering
+ * the entire _CRS 32-bit window issue as the Lenovo *IIL* models.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=214259
+ */
+ {
+ .callback = set_no_e820,
+ .ident = "Clevo X170KM-G Barebone",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "X170KM-G"),
+ },
+ },
{}
};
@@ -145,6 +199,27 @@ void __init pci_acpi_crs_quirks(void)
if (year >= 0 && year < 2008 && iomem_resource.end <= 0xffffffff)
pci_use_crs = false;
+ /*
+ * Some firmware includes unusable space (host bridge registers,
+ * hidden PCI device BARs, etc) in PCI host bridge _CRS. This is a
+ * firmware defect, and 4dc2287c1805 ("x86: avoid E820 regions when
+ * allocating address space") has clipped out the unusable space in
+ * the past.
+ *
+ * But other firmware supplies E820 reserved regions that cover
+ * entire _CRS windows, so clipping throws away the entire window,
+ * leaving none for hot-added or uninitialized devices. These E820
+ * entries are probably *not* a firmware defect, so disable the
+ * clipping by default for post-2022 machines.
+ *
+ * We already have quirks to disable clipping for pre-2023
+ * machines, and we'll likely need quirks to *enable* clipping for
+ * post-2022 machines that incorrectly include unusable space in
+ * _CRS.
+ */
+ if (year >= 2023)
+ pci_use_e820 = false;
+
dmi_check_system(pci_crs_quirks);
/*
@@ -160,6 +235,17 @@ void __init pci_acpi_crs_quirks(void)
"if necessary, use \"pci=%s\" and report a bug\n",
pci_use_crs ? "Using" : "Ignoring",
pci_use_crs ? "nocrs" : "use_crs");
+
+ /* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */
+ if (pci_probe & PCI_NO_E820)
+ pci_use_e820 = false;
+ else if (pci_probe & PCI_USE_E820)
+ pci_use_e820 = true;
+
+ printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n",
+ pci_use_e820 ? "Using" : "Ignoring");
+ if (pci_probe & (PCI_NO_E820 | PCI_USE_E820))
+ printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n");
}
#ifdef CONFIG_PCI_MMCONFIG
@@ -299,6 +385,7 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
int status;
status = acpi_pci_probe_root_resources(ci);
+
if (pci_use_crs) {
resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
if (resource_is_pcicfg_ioport(entry->res))
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index bfa50e65ef6c..dd40d3fea74e 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -126,7 +126,7 @@ static int __init early_root_info_init(void)
node = (reg >> 4) & 0x07;
link = (reg >> 8) & 0x03;
- info = alloc_pci_root_info(min_bus, max_bus, node, link);
+ alloc_pci_root_info(min_bus, max_bus, node, link);
}
/*
@@ -284,7 +284,7 @@ static int __init early_root_info_init(void)
/* need to take out [4G, TOM2) for RAM*/
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
+ address = MSR_AMD64_SYSCFG;
rdmsrl(address, val);
/* TOP_MEM2 is enabled? */
if (val & (1<<21)) {
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 3507f456fcd0..ddb798603201 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -595,6 +595,14 @@ char *__init pcibios_setup(char *str)
} else if (!strcmp(str, "nocrs")) {
pci_probe |= PCI_ROOT_NO_CRS;
return NULL;
+ } else if (!strcmp(str, "use_e820")) {
+ pci_probe |= PCI_USE_E820;
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return NULL;
+ } else if (!strcmp(str, "no_e820")) {
+ pci_probe |= PCI_NO_E820;
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ return NULL;
#ifdef CONFIG_PHYS_ADDR_T_64BIT
} else if (!strcmp(str, "big_root_window")) {
pci_probe |= PCI_BIG_ROOT_WINDOW;
@@ -632,7 +640,7 @@ static void set_dev_domain_options(struct pci_dev *pdev)
pdev->hotplug_user_indicators = 1;
}
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
{
struct pci_setup_rom *rom;
struct irq_domain *msidom;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 0a0e168be1cb..615a76d70019 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -353,8 +353,8 @@ static void pci_fixup_video(struct pci_dev *pdev)
}
}
}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
static const struct dmi_system_id msi_k8t_dmi_table[] = {
@@ -375,7 +375,7 @@ static const struct dmi_system_id msi_k8t_dmi_table[] = {
* The BIOS only gives options "DISABLED" and "AUTO". This code sets
* the corresponding register-value to enable the soundcard.
*
- * The soundcard is only enabled, if the mainborad is identified
+ * The soundcard is only enabled, if the mainboard is identified
* via DMI-tables and the soundcard is detected to be off.
*/
static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev)
@@ -779,4 +779,48 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+#define RS690_LOWER_TOP_OF_DRAM2 0x30
+#define RS690_LOWER_TOP_OF_DRAM2_VALID 0x1
+#define RS690_UPPER_TOP_OF_DRAM2 0x31
+#define RS690_HTIU_NB_INDEX 0xA8
+#define RS690_HTIU_NB_INDEX_WR_ENABLE 0x100
+#define RS690_HTIU_NB_DATA 0xAC
+
+/*
+ * Some BIOS implementations support RAM above 4GB, but do not configure the
+ * PCI host to respond to bus master accesses for these addresses. These
+ * implementations set the TOP_OF_DRAM_SLOT1 register correctly, so PCI DMA
+ * works as expected for addresses below 4GB.
+ *
+ * Reference: "AMD RS690 ASIC Family Register Reference Guide" (pg. 2-57)
+ * https://www.amd.com/system/files/TechDocs/43372_rs690_rrg_3.00o.pdf
+ */
+static void rs690_fix_64bit_dma(struct pci_dev *pdev)
+{
+ u32 val = 0;
+ phys_addr_t top_of_dram = __pa(high_memory - 1) + 1;
+
+ if (top_of_dram <= (1ULL << 32))
+ return;
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_LOWER_TOP_OF_DRAM2);
+ pci_read_config_dword(pdev, RS690_HTIU_NB_DATA, &val);
+
+ if (val)
+ return;
+
+ pci_info(pdev, "Adjusting top of DRAM to %pa for 64-bit DMA support\n", &top_of_dram);
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_UPPER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE);
+ pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, top_of_dram >> 32);
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_LOWER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE);
+ pci_write_config_dword(pdev, RS690_HTIU_NB_DATA,
+ top_of_dram | RS690_LOWER_TOP_OF_DRAM2_VALID);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma);
+
#endif
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index d3a73f9335e1..a498b847d740 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -13,14 +13,20 @@
#include <linux/dmi.h>
#include <linux/io.h>
#include <linux/smp.h>
+#include <linux/spinlock.h>
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
+
+#include <asm/i8259.h>
+#include <asm/pc-conf-reg.h>
#include <asm/pci_x86.h>
#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
#define PIRQ_VERSION 0x0100
+#define IRT_SIGNATURE (('$' << 0) + ('I' << 8) + ('R' << 16) + ('T' << 24))
+
static int broken_hp_bios_irq9;
static int acer_tm360_irqrouting;
@@ -47,6 +53,8 @@ struct irq_router {
int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
int new);
+ int (*lvl)(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int irq);
};
struct irq_router_handler {
@@ -62,30 +70,99 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
* and perform checksum verification.
*/
-static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr,
+ u8 *limit)
{
struct irq_routing_table *rt;
int i;
u8 sum;
- rt = (struct irq_routing_table *) addr;
+ rt = (struct irq_routing_table *)addr;
if (rt->signature != PIRQ_SIGNATURE ||
rt->version != PIRQ_VERSION ||
rt->size % 16 ||
- rt->size < sizeof(struct irq_routing_table))
+ rt->size < sizeof(struct irq_routing_table) ||
+ (limit && rt->size > limit - addr))
return NULL;
sum = 0;
for (i = 0; i < rt->size; i++)
sum += addr[i];
if (!sum) {
- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
- rt);
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%lx\n",
+ __pa(rt));
return rt;
}
return NULL;
}
+/*
+ * Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP
+ * (BIOS Configuration Program) external tool meant for tweaking BIOS
+ * structures without the need to rebuild it from sources. The $IRT
+ * format has been invented by AMI before Microsoft has come up with its
+ * $PIR format and a $IRT table is therefore there in some systems that
+ * lack a $PIR table.
+ *
+ * It uses the same PCI BIOS 2.1 format for interrupt routing entries
+ * themselves but has a different simpler header prepended instead,
+ * occupying 8 bytes, where a `$IRT' signature is followed by one byte
+ * specifying the total number of interrupt routing entries allocated in
+ * the table, then one byte specifying the actual number of entries used
+ * (which the BCP tool can take advantage of when modifying the table),
+ * and finally a 16-bit word giving the IRQs devoted exclusively to PCI.
+ * Unlike with the $PIR table there is no alignment guarantee.
+ *
+ * Given the similarity of the two formats the $IRT one is trivial to
+ * convert to the $PIR one, which we do here, except that obviously we
+ * have no information as to the router device to use, but we can handle
+ * it by matching PCI device IDs actually seen on the bus against ones
+ * that our individual routers recognise.
+ *
+ * Reportedly there is another $IRT table format where a 16-bit word
+ * follows the header instead that points to interrupt routing entries
+ * in a $PIR table provided elsewhere. In that case this code will not
+ * be reached though as the $PIR table will have been chosen instead.
+ */
+static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr,
+ u8 *limit)
+{
+ struct irt_routing_table *ir;
+ struct irq_routing_table *rt;
+ u16 size;
+ u8 sum;
+ int i;
+
+ ir = (struct irt_routing_table *)addr;
+ if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used)
+ return NULL;
+
+ size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]);
+ if (size > limit - addr)
+ return NULL;
+
+ DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n",
+ __pa(ir));
+ size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]);
+ rt = kzalloc(size, GFP_KERNEL);
+ if (!rt)
+ return NULL;
+
+ rt->signature = PIRQ_SIGNATURE;
+ rt->version = PIRQ_VERSION;
+ rt->size = size;
+ rt->exclusive_irqs = ir->exclusive_irqs;
+ for (i = 0; i < ir->used; i++)
+ rt->slots[i] = ir->slots[i];
+
+ addr = (u8 *)rt;
+ sum = 0;
+ for (i = 0; i < size; i++)
+ sum += addr[i];
+ rt->checksum = -sum;
+
+ return rt;
+}
/*
* Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
@@ -93,17 +170,29 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
static struct irq_routing_table * __init pirq_find_routing_table(void)
{
+ u8 * const bios_start = (u8 *)__va(0xf0000);
+ u8 * const bios_end = (u8 *)__va(0x100000);
u8 *addr;
struct irq_routing_table *rt;
if (pirq_table_addr) {
- rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
+ rt = pirq_check_routing_table((u8 *)__va(pirq_table_addr),
+ NULL);
if (rt)
return rt;
printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
}
- for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
- rt = pirq_check_routing_table(addr);
+ for (addr = bios_start;
+ addr < bios_end - sizeof(struct irq_routing_table);
+ addr += 16) {
+ rt = pirq_check_routing_table(addr, bios_end);
+ if (rt)
+ return rt;
+ }
+ for (addr = bios_start;
+ addr < bios_end - sizeof(struct irt_routing_table);
+ addr++) {
+ rt = pirq_convert_irt_table(addr, bios_end);
if (rt)
return rt;
}
@@ -129,7 +218,8 @@ static void __init pirq_peer_trick(void)
#ifdef DEBUG
{
int j;
- DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
+ DBG(KERN_DEBUG "%02x:%02x.%x slot=%02x",
+ e->bus, e->devfn / 8, e->devfn % 8, e->slot);
for (j = 0; j < 4; j++)
DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
DBG("\n");
@@ -153,7 +243,7 @@ static void __init pirq_peer_trick(void)
void elcr_set_level_irq(unsigned int irq)
{
unsigned char mask = 1 << (irq & 7);
- unsigned int port = 0x4d0 + (irq >> 3);
+ unsigned int port = PIC_ELCR1 + (irq >> 3);
unsigned char val;
static u16 elcr_irq_mask;
@@ -170,6 +260,152 @@ void elcr_set_level_irq(unsigned int irq)
}
/*
+ * PIRQ routing for the M1487 ISA Bus Controller (IBC) ASIC used
+ * with the ALi FinALi 486 chipset. The IBC is not decoded in the
+ * PCI configuration space, so we identify it by the accompanying
+ * M1489 Cache-Memory PCI Controller (CMP) ASIC.
+ *
+ * There are four 4-bit mappings provided, spread across two PCI
+ * INTx Routing Table Mapping Registers, available in the port I/O
+ * space accessible indirectly via the index/data register pair at
+ * 0x22/0x23, located at indices 0x42 and 0x43 for the INT1/INT2
+ * and INT3/INT4 lines respectively. The INT1/INT3 and INT2/INT4
+ * lines are mapped in the low and the high 4-bit nibble of the
+ * corresponding register as follows:
+ *
+ * 0000 : Disabled
+ * 0001 : IRQ9
+ * 0010 : IRQ3
+ * 0011 : IRQ10
+ * 0100 : IRQ4
+ * 0101 : IRQ5
+ * 0110 : IRQ7
+ * 0111 : IRQ6
+ * 1000 : Reserved
+ * 1001 : IRQ11
+ * 1010 : Reserved
+ * 1011 : IRQ12
+ * 1100 : Reserved
+ * 1101 : IRQ14
+ * 1110 : Reserved
+ * 1111 : IRQ15
+ *
+ * In addition to the usual ELCR register pair there is a separate
+ * PCI INTx Sensitivity Register at index 0x44 in the same port I/O
+ * space, whose bits 3:0 select the trigger mode for INT[4:1] lines
+ * respectively. Any bit set to 1 causes interrupts coming on the
+ * corresponding line to be passed to ISA as edge-triggered and
+ * otherwise they are passed as level-triggered. Manufacturer's
+ * documentation says this register has to be set consistently with
+ * the relevant ELCR register.
+ *
+ * Accesses to the port I/O space concerned here need to be unlocked
+ * by writing the value of 0xc5 to the Lock Register at index 0x03
+ * beforehand. Any other value written to said register prevents
+ * further accesses from reaching the register file, except for the
+ * Lock Register being written with 0xc5 again.
+ *
+ * References:
+ *
+ * "M1489/M1487: 486 PCI Chip Set", Version 1.2, Acer Laboratories
+ * Inc., July 1997
+ */
+
+#define PC_CONF_FINALI_LOCK 0x03u
+#define PC_CONF_FINALI_PCI_INTX_RT1 0x42u
+#define PC_CONF_FINALI_PCI_INTX_RT2 0x43u
+#define PC_CONF_FINALI_PCI_INTX_SENS 0x44u
+
+#define PC_CONF_FINALI_LOCK_KEY 0xc5u
+
+static u8 read_pc_conf_nybble(u8 base, u8 index)
+{
+ u8 reg = base + (index >> 1);
+ u8 x;
+
+ x = pc_conf_get(reg);
+ return index & 1 ? x >> 4 : x & 0xf;
+}
+
+static void write_pc_conf_nybble(u8 base, u8 index, u8 val)
+{
+ u8 reg = base + (index >> 1);
+ u8 x;
+
+ x = pc_conf_get(reg);
+ x = index & 1 ? (x & 0x0f) | (val << 4) : (x & 0xf0) | val;
+ pc_conf_set(reg, x);
+}
+
+/*
+ * FinALi pirq rules are as follows:
+ *
+ * - bit 0 selects between INTx Routing Table Mapping Registers,
+ *
+ * - bit 3 selects the nibble within the INTx Routing Table Mapping Register,
+ *
+ * - bits 7:4 map to bits 3:0 of the PCI INTx Sensitivity Register.
+ */
+static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev,
+ int pirq)
+{
+ static const u8 irqmap[16] = {
+ 0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15
+ };
+ unsigned long flags;
+ u8 index;
+ u8 x;
+
+ index = (pirq & 1) << 1 | (pirq & 8) >> 3;
+ raw_spin_lock_irqsave(&pc_conf_lock, flags);
+ pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY);
+ x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index)];
+ pc_conf_set(PC_CONF_FINALI_LOCK, 0);
+ raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
+ return x;
+}
+
+static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
+{
+ static const u8 irqmap[16] = {
+ 0, 0, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15
+ };
+ u8 val = irqmap[irq];
+ unsigned long flags;
+ u8 index;
+
+ if (!val)
+ return 0;
+
+ index = (pirq & 1) << 1 | (pirq & 8) >> 3;
+ raw_spin_lock_irqsave(&pc_conf_lock, flags);
+ pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY);
+ write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index, val);
+ pc_conf_set(PC_CONF_FINALI_LOCK, 0);
+ raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
+ return 1;
+}
+
+static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
+{
+ u8 mask = ~((pirq & 0xf0u) >> 4);
+ unsigned long flags;
+ u8 trig;
+
+ elcr_set_level_irq(irq);
+ raw_spin_lock_irqsave(&pc_conf_lock, flags);
+ pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY);
+ trig = pc_conf_get(PC_CONF_FINALI_PCI_INTX_SENS);
+ trig &= mask;
+ pc_conf_set(PC_CONF_FINALI_PCI_INTX_SENS, trig);
+ pc_conf_set(PC_CONF_FINALI_LOCK, 0);
+ raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
+ return 1;
+}
+
+/*
* Common IRQ routing practice: nibbles in config space,
* offset by some magic constant.
*/
@@ -220,6 +456,74 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
}
/*
+ * PIRQ routing for the 82374EB/82374SB EISA System Component (ESC)
+ * ASIC used with the Intel 82420 and 82430 PCIsets. The ESC is not
+ * decoded in the PCI configuration space, so we identify it by the
+ * accompanying 82375EB/82375SB PCI-EISA Bridge (PCEB) ASIC.
+ *
+ * There are four PIRQ Route Control registers, available in the
+ * port I/O space accessible indirectly via the index/data register
+ * pair at 0x22/0x23, located at indices 0x60/0x61/0x62/0x63 for the
+ * PIRQ0/1/2/3# lines respectively. The semantics is the same as
+ * with the PIIX router.
+ *
+ * Accesses to the port I/O space concerned here need to be unlocked
+ * by writing the value of 0x0f to the ESC ID Register at index 0x02
+ * beforehand. Any other value written to said register prevents
+ * further accesses from reaching the register file, except for the
+ * ESC ID Register being written with 0x0f again.
+ *
+ * References:
+ *
+ * "82374EB/82374SB EISA System Component (ESC)", Intel Corporation,
+ * Order Number: 290476-004, March 1996
+ *
+ * "82375EB/82375SB PCI-EISA Bridge (PCEB)", Intel Corporation, Order
+ * Number: 290477-004, March 1996
+ */
+
+#define PC_CONF_I82374_ESC_ID 0x02u
+#define PC_CONF_I82374_PIRQ_ROUTE_CONTROL 0x60u
+
+#define PC_CONF_I82374_ESC_ID_KEY 0x0fu
+
+static int pirq_esc_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ unsigned long flags;
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1;
+
+ raw_spin_lock_irqsave(&pc_conf_lock, flags);
+ pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY);
+ x = pc_conf_get(reg);
+ pc_conf_set(PC_CONF_I82374_ESC_ID, 0);
+ raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
+ return (x < 16) ? x : 0;
+}
+
+static int pirq_esc_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int irq)
+{
+ unsigned long flags;
+ int reg;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1;
+
+ raw_spin_lock_irqsave(&pc_conf_lock, flags);
+ pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY);
+ pc_conf_set(reg, irq);
+ pc_conf_set(PC_CONF_I82374_ESC_ID, 0);
+ raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
+ return 1;
+}
+
+/*
* The Intel PIIX4 pirq rules are fairly simple: "pirq" is
* just a pointer to the config space.
*/
@@ -238,6 +542,50 @@ static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
}
/*
+ * PIRQ routing for the 82426EX ISA Bridge (IB) ASIC used with the
+ * Intel 82420EX PCIset.
+ *
+ * There are only two PIRQ Route Control registers, available in the
+ * combined 82425EX/82426EX PCI configuration space, at 0x66 and 0x67
+ * for the PIRQ0# and PIRQ1# lines respectively. The semantics is
+ * the same as with the PIIX router.
+ *
+ * References:
+ *
+ * "82420EX PCIset Data Sheet, 82425EX PCI System Controller (PSC)
+ * and 82426EX ISA Bridge (IB)", Intel Corporation, Order Number:
+ * 290488-004, December 1995
+ */
+
+#define PCI_I82426EX_PIRQ_ROUTE_CONTROL 0x66u
+
+static int pirq_ib_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 2)
+ reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1;
+
+ pci_read_config_byte(router, reg, &x);
+ return (x < 16) ? x : 0;
+}
+
+static int pirq_ib_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int irq)
+{
+ int reg;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 2)
+ reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1;
+
+ pci_write_config_byte(router, reg, irq);
+ return 1;
+}
+
+/*
* The VIA pirq rules are nibble-based, like ALI,
* but without the ugly irq number munging.
* However, PIRQD is in the upper instead of lower 4 bits.
@@ -328,6 +676,81 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
return 1;
}
+
+/*
+ * PIRQ routing for the SiS85C497 AT Bus Controller & Megacell (ATM)
+ * ISA bridge used with the SiS 85C496/497 486 Green PC VESA/ISA/PCI
+ * Chipset.
+ *
+ * There are four PCI INTx#-to-IRQ Link registers provided in the
+ * SiS85C497 part of the peculiar combined 85C496/497 configuration
+ * space decoded by the SiS85C496 PCI & CPU Memory Controller (PCM)
+ * host bridge, at 0xc0/0xc1/0xc2/0xc3 respectively for the PCI INT
+ * A/B/C/D lines. Bit 7 enables the respective link if set and bits
+ * 3:0 select the 8259A IRQ line as follows:
+ *
+ * 0000 : Reserved
+ * 0001 : Reserved
+ * 0010 : Reserved
+ * 0011 : IRQ3
+ * 0100 : IRQ4
+ * 0101 : IRQ5
+ * 0110 : IRQ6
+ * 0111 : IRQ7
+ * 1000 : Reserved
+ * 1001 : IRQ9
+ * 1010 : IRQ10
+ * 1011 : IRQ11
+ * 1100 : IRQ12
+ * 1101 : Reserved
+ * 1110 : IRQ14
+ * 1111 : IRQ15
+ *
+ * We avoid using a reserved value for disabled links, hence the
+ * choice of IRQ15 for that case.
+ *
+ * References:
+ *
+ * "486 Green PC VESA/ISA/PCI Chipset, SiS 85C496/497", Rev 3.0,
+ * Silicon Integrated Systems Corp., July 1995
+ */
+
+#define PCI_SIS497_INTA_TO_IRQ_LINK 0xc0u
+
+#define PIRQ_SIS497_IRQ_MASK 0x0fu
+#define PIRQ_SIS497_IRQ_ENABLE 0x80u
+
+static int pirq_sis497_get(struct pci_dev *router, struct pci_dev *dev,
+ int pirq)
+{
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1;
+
+ pci_read_config_byte(router, reg, &x);
+ return (x & PIRQ_SIS497_IRQ_ENABLE) ? (x & PIRQ_SIS497_IRQ_MASK) : 0;
+}
+
+static int pirq_sis497_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
+{
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1;
+
+ pci_read_config_byte(router, reg, &x);
+ x &= ~(PIRQ_SIS497_IRQ_MASK | PIRQ_SIS497_IRQ_ENABLE);
+ x |= irq ? (PIRQ_SIS497_IRQ_ENABLE | irq) : PIRQ_SIS497_IRQ_MASK;
+ pci_write_config_byte(router, reg, x);
+ return 1;
+}
+
/*
* PIRQ routing for SiS 85C503 router used in several SiS chipsets.
* We have to deal with the following issues here:
@@ -389,11 +812,12 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
* bit 6-4 are probably unused, not like 5595
*/
-#define PIRQ_SIS_IRQ_MASK 0x0f
-#define PIRQ_SIS_IRQ_DISABLE 0x80
-#define PIRQ_SIS_USB_ENABLE 0x40
+#define PIRQ_SIS503_IRQ_MASK 0x0f
+#define PIRQ_SIS503_IRQ_DISABLE 0x80
+#define PIRQ_SIS503_USB_ENABLE 0x40
-static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+static int pirq_sis503_get(struct pci_dev *router, struct pci_dev *dev,
+ int pirq)
{
u8 x;
int reg;
@@ -402,10 +826,11 @@ static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
if (reg >= 0x01 && reg <= 0x04)
reg += 0x40;
pci_read_config_byte(router, reg, &x);
- return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
+ return (x & PIRQ_SIS503_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS503_IRQ_MASK);
}
-static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+static int pirq_sis503_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
{
u8 x;
int reg;
@@ -414,8 +839,8 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
if (reg >= 0x01 && reg <= 0x04)
reg += 0x40;
pci_read_config_byte(router, reg, &x);
- x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
- x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
+ x &= ~(PIRQ_SIS503_IRQ_MASK | PIRQ_SIS503_IRQ_DISABLE);
+ x |= irq ? irq : PIRQ_SIS503_IRQ_DISABLE;
pci_write_config_byte(router, reg, x);
return 1;
}
@@ -549,6 +974,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
return 0;
switch (device) {
+ case PCI_DEVICE_ID_INTEL_82375:
+ r->name = "PCEB/ESC";
+ r->get = pirq_esc_get;
+ r->set = pirq_esc_set;
+ return 1;
case PCI_DEVICE_ID_INTEL_82371FB_0:
case PCI_DEVICE_ID_INTEL_82371SB_0:
case PCI_DEVICE_ID_INTEL_82371AB_0:
@@ -594,6 +1024,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
r->get = pirq_piix_get;
r->set = pirq_piix_set;
return 1;
+ case PCI_DEVICE_ID_INTEL_82425:
+ r->name = "PSC/IB";
+ r->get = pirq_ib_get;
+ r->set = pirq_ib_set;
+ return 1;
}
if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
@@ -697,13 +1132,19 @@ static __init int serverworks_router_probe(struct irq_router *r,
static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- if (device != PCI_DEVICE_ID_SI_503)
- return 0;
-
- r->name = "SIS";
- r->get = pirq_sis_get;
- r->set = pirq_sis_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_SI_496:
+ r->name = "SiS85C497";
+ r->get = pirq_sis497_get;
+ r->set = pirq_sis497_set;
+ return 1;
+ case PCI_DEVICE_ID_SI_503:
+ r->name = "SiS85C503";
+ r->get = pirq_sis503_get;
+ r->set = pirq_sis503_set;
+ return 1;
+ }
+ return 0;
}
static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
@@ -745,6 +1186,12 @@ static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router,
static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
switch (device) {
+ case PCI_DEVICE_ID_AL_M1489:
+ r->name = "FinALi";
+ r->get = pirq_finali_get;
+ r->set = pirq_finali_set;
+ r->lvl = pirq_finali_lvl;
+ return 1;
case PCI_DEVICE_ID_AL_M1533:
case PCI_DEVICE_ID_AL_M1563:
r->name = "ALI";
@@ -817,10 +1264,32 @@ static struct pci_dev *pirq_router_dev;
* chipset" ?
*/
+static bool __init pirq_try_router(struct irq_router *r,
+ struct irq_routing_table *rt,
+ struct pci_dev *dev)
+{
+ struct irq_router_handler *h;
+
+ DBG(KERN_DEBUG "PCI: Trying IRQ router for [%04x:%04x]\n",
+ dev->vendor, dev->device);
+
+ for (h = pirq_routers; h->vendor; h++) {
+ /* First look for a router match */
+ if (rt->rtr_vendor == h->vendor &&
+ h->probe(r, dev, rt->rtr_device))
+ return true;
+ /* Fall back to a device match */
+ if (dev->vendor == h->vendor &&
+ h->probe(r, dev, dev->device))
+ return true;
+ }
+ return false;
+}
+
static void __init pirq_find_router(struct irq_router *r)
{
struct irq_routing_table *rt = pirq_table;
- struct irq_router_handler *h;
+ struct pci_dev *dev;
#ifdef CONFIG_PCI_BIOS
if (!rt->signature) {
@@ -839,50 +1308,94 @@ static void __init pirq_find_router(struct irq_router *r)
DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
rt->rtr_vendor, rt->rtr_device);
- pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
- rt->rtr_devfn);
- if (!pirq_router_dev) {
- DBG(KERN_DEBUG "PCI: Interrupt router not found at "
- "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
- return;
+ /* Use any vendor:device provided by the routing table or try all. */
+ if (rt->rtr_vendor) {
+ dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
+ rt->rtr_devfn);
+ if (dev && pirq_try_router(r, rt, dev))
+ pirq_router_dev = dev;
+ } else {
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ if (pirq_try_router(r, rt, dev)) {
+ pirq_router_dev = dev;
+ break;
+ }
+ }
}
- for (h = pirq_routers; h->vendor; h++) {
- /* First look for a router match */
- if (rt->rtr_vendor == h->vendor &&
- h->probe(r, pirq_router_dev, rt->rtr_device))
- break;
- /* Fall back to a device match */
- if (pirq_router_dev->vendor == h->vendor &&
- h->probe(r, pirq_router_dev, pirq_router_dev->device))
- break;
- }
- dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
- pirq_router.name,
- pirq_router_dev->vendor, pirq_router_dev->device);
+ if (pirq_router_dev)
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
+ else
+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
/* The device remains referenced for the kernel lifetime */
}
-static struct irq_info *pirq_get_info(struct pci_dev *dev)
+/*
+ * We're supposed to match on the PCI device only and not the function,
+ * but some BIOSes build their tables with the PCI function included
+ * for motherboard devices, so if a complete match is found, then give
+ * it precedence over a slot match.
+ */
+static struct irq_info *pirq_get_dev_info(struct pci_dev *dev)
{
struct irq_routing_table *rt = pirq_table;
int entries = (rt->size - sizeof(struct irq_routing_table)) /
sizeof(struct irq_info);
+ struct irq_info *slotinfo = NULL;
struct irq_info *info;
for (info = rt->slots; entries--; info++)
- if (info->bus == dev->bus->number &&
- PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
- return info;
- return NULL;
+ if (info->bus == dev->bus->number) {
+ if (info->devfn == dev->devfn)
+ return info;
+ if (!slotinfo &&
+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ slotinfo = info;
+ }
+ return slotinfo;
+}
+
+/*
+ * Buses behind bridges are typically not listed in the PIRQ routing table.
+ * Do the usual dance then and walk the tree of bridges up adjusting the
+ * pin number accordingly on the way until the originating root bus device
+ * has been reached and then use its routing information.
+ */
+static struct irq_info *pirq_get_info(struct pci_dev *dev, u8 *pin)
+{
+ struct pci_dev *temp_dev = dev;
+ struct irq_info *info;
+ u8 temp_pin = *pin;
+ u8 dpin = temp_pin;
+
+ info = pirq_get_dev_info(dev);
+ while (!info && temp_dev->bus->parent) {
+ struct pci_dev *bridge = temp_dev->bus->self;
+
+ temp_pin = pci_swizzle_interrupt_pin(temp_dev, temp_pin);
+ info = pirq_get_dev_info(bridge);
+ if (info)
+ dev_warn(&dev->dev,
+ "using bridge %s INT %c to get INT %c\n",
+ pci_name(bridge),
+ 'A' + temp_pin - 1, 'A' + dpin - 1);
+
+ temp_dev = bridge;
+ }
+ *pin = temp_pin;
+ return info;
}
static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
{
- u8 pin;
struct irq_info *info;
int i, pirq, newirq;
+ u8 dpin, pin;
int irq = 0;
u32 mask;
struct irq_router *r = &pirq_router;
@@ -890,8 +1403,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
char *msg = NULL;
/* Find IRQ pin */
- pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
- if (!pin) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &dpin);
+ if (!dpin) {
dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
@@ -904,20 +1417,21 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
if (!pirq_table)
return 0;
- info = pirq_get_info(dev);
+ pin = dpin;
+ info = pirq_get_info(dev, &pin);
if (!info) {
dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
- 'A' + pin - 1);
+ 'A' + dpin - 1);
return 0;
}
pirq = info->irq[pin - 1].link;
mask = info->irq[pin - 1].bitmap;
if (!pirq) {
- dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + dpin - 1);
return 0;
}
dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
- 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
+ 'A' + dpin - 1, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -959,7 +1473,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
newirq = i;
}
}
- dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + dpin - 1, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
@@ -968,11 +1482,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
} else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
msg = "found";
- elcr_set_level_irq(irq);
+ if (r->lvl)
+ r->lvl(pirq_router_dev, dev, pirq, irq);
+ else
+ elcr_set_level_irq(irq);
} else if (newirq && r->set &&
(dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
if (r->set(pirq_router_dev, dev, pirq, newirq)) {
- elcr_set_level_irq(newirq);
+ if (r->lvl)
+ r->lvl(pirq_router_dev, dev, pirq, newirq);
+ else
+ elcr_set_level_irq(newirq);
msg = "assigned";
irq = newirq;
}
@@ -987,15 +1507,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 0;
}
}
- dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n",
+ msg, 'A' + dpin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
for_each_pci_dev(dev2) {
- pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
- if (!pin)
+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &dpin);
+ if (!dpin)
continue;
- info = pirq_get_info(dev2);
+ pin = dpin;
+ info = pirq_get_info(dev2, &pin);
if (!info)
continue;
if (info->irq[pin - 1].link == pirq) {
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index de6bf0e7e8f8..758cbfe55daa 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -461,7 +461,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
}
if (size < (16UL<<20) && size != old_size)
- return 0;
+ return false;
if (dev)
dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
@@ -493,7 +493,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
&cfg->res, (unsigned long) cfg->address);
}
- return 1;
+ return true;
}
static bool __ref
@@ -501,7 +501,7 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
{
if (!early && !acpi_disabled) {
if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
- return 1;
+ return true;
if (dev)
dev_info(dev, FW_INFO
@@ -522,14 +522,14 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
* _CBA method, just assume it's reserved.
*/
if (pci_mmcfg_running_state)
- return 1;
+ return true;
/* Don't try to do this check unless configuration
type 1 is available. how about type 2 ?*/
if (raw_pci_ops)
return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1);
- return 0;
+ return false;
}
static void __init pci_mmcfg_reject_broken(int early)
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 01a085d9135a..4f0147d4e225 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <asm/pci_x86.h>
+#include <asm/numachip/numachip.h>
static u8 limit __read_mostly;
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7d2525691854..7368afc03998 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
- if (swiotlb_late_init_with_default_size(size))
+ if (swiotlb_init_late(size, GFP_DMA, NULL))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
@@ -146,8 +146,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev)
dev_err(dev, "sta2x11: could not set DMA offset\n");
dev->bus_dma_limit = max_amba_addr;
- pci_set_consistent_dma_mask(pdev, max_amba_addr);
- pci_set_dma_mask(pdev, max_amba_addr);
+ dma_set_mask_and_coherent(&pdev->dev, max_amba_addr);
/* Configure AHB mapping */
pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 3d41a09c2c14..b94f727251b6 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,7 @@
#include <xen/features.h>
#include <xen/events.h>
+#include <xen/pci.h>
#include <asm/xen/pci.h>
#include <asm/xen/cpuid.h>
#include <asm/apic.h>
@@ -113,7 +114,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
false /* no mapping of GSI to PIRQ */);
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
static int xen_register_gsi(u32 gsi, int triggering, int polarity)
{
int rc, irq;
@@ -183,7 +184,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret)
goto error;
i = 0;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
(type == PCI_CAP_ID_MSI) ? nvec : 1,
(type == PCI_CAP_ID_MSIX) ?
@@ -234,7 +235,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
pirq = xen_allocate_pirq_msi(dev, msidesc);
if (pirq < 0) {
irq = -ENODEV;
@@ -261,7 +262,7 @@ error:
return irq;
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
static bool __read_mostly pci_seg_supported = true;
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -269,7 +270,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret = 0;
struct msi_desc *msidesc;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
struct physdev_map_pirq map_irq;
domid_t domid;
@@ -305,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return -EINVAL;
map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ map_irq.entry_nr = msidesc->msi_index;
}
ret = -EINVAL;
@@ -350,10 +351,13 @@ out:
return ret;
}
-static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
+bool xen_initdom_restore_msi(struct pci_dev *dev)
{
int ret = 0;
+ if (!xen_initial_domain())
+ return true;
+
if (pci_seg_supported) {
struct physdev_pci_device restore_ext;
@@ -374,30 +378,26 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
}
+ return false;
}
-#else /* CONFIG_XEN_DOM0 */
+#else /* CONFIG_XEN_PV_DOM0 */
#define xen_initdom_setup_msi_irqs NULL
-#define xen_initdom_restore_msi_irqs NULL
-#endif /* !CONFIG_XEN_DOM0 */
+#endif /* !CONFIG_XEN_PV_DOM0 */
static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
struct msi_desc *msidesc;
int i;
- for_each_pci_msi_entry(msidesc, dev) {
- if (msidesc->irq) {
- for (i = 0; i < msidesc->nvec_used; i++)
- xen_destroy_irq(msidesc->irq + i);
- }
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
+ for (i = 0; i < msidesc->nvec_used; i++)
+ xen_destroy_irq(msidesc->irq + i);
}
}
static void xen_pv_teardown_msi_irqs(struct pci_dev *dev)
{
- struct msi_desc *msidesc = first_pci_msi_entry(dev);
-
- if (msidesc->msi_attrib.is_msix)
+ if (dev->msix_enabled)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
@@ -413,10 +413,7 @@ static int xen_msi_domain_alloc_irqs(struct irq_domain *domain,
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return -EINVAL;
- if (first_msi_entry(dev)->msi_attrib.is_msix)
- type = PCI_CAP_ID_MSIX;
- else
- type = PCI_CAP_ID_MSI;
+ type = to_pci_dev(dev)->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type);
}
@@ -465,14 +462,11 @@ static __init struct irq_domain *xen_create_pci_msi_domain(void)
static __init void xen_setup_pci_msi(void)
{
if (xen_pv_domain()) {
- if (xen_initial_domain()) {
+ if (xen_initial_domain())
xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs;
- x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
- } else {
+ else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
- }
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
- pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
xen_msi_ops.setup_msi_irqs = xen_hvm_setup_msi_irqs;
xen_msi_ops.teardown_msi_irqs = xen_teardown_msi_irqs;
@@ -486,6 +480,11 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
+ /*
+ * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
+ * controlled by the hypervisor.
+ */
+ pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
@@ -555,7 +554,7 @@ int __init pci_xen_hvm_init(void)
return 0;
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
int __init pci_xen_initial_domain(void)
{
int irq;
@@ -583,77 +582,5 @@ int __init pci_xen_initial_domain(void)
}
return 0;
}
-
-struct xen_device_domain_owner {
- domid_t domain;
- struct pci_dev *dev;
- struct list_head list;
-};
-
-static DEFINE_SPINLOCK(dev_domain_list_spinlock);
-static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
-
-static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
-
- list_for_each_entry(owner, &dev_domain_list, list) {
- if (owner->dev == dev)
- return owner;
- }
- return NULL;
-}
-
-int xen_find_device_domain_owner(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
- int domain = -ENODEV;
-
- spin_lock(&dev_domain_list_spinlock);
- owner = find_device(dev);
- if (owner)
- domain = owner->domain;
- spin_unlock(&dev_domain_list_spinlock);
- return domain;
-}
-EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
-
-int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
-{
- struct xen_device_domain_owner *owner;
-
- owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
- if (!owner)
- return -ENODEV;
-
- spin_lock(&dev_domain_list_spinlock);
- if (find_device(dev)) {
- spin_unlock(&dev_domain_list_spinlock);
- kfree(owner);
- return -EEXIST;
- }
- owner->domain = domain;
- owner->dev = dev;
- list_add_tail(&owner->list, &dev_domain_list);
- spin_unlock(&dev_domain_list_spinlock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
-
-int xen_unregister_device_domain_owner(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
-
- spin_lock(&dev_domain_list_spinlock);
- owner = find_device(dev);
- if (!owner) {
- spin_unlock(&dev_domain_list_spinlock);
- return -ENODEV;
- }
- list_del(&owner->list);
- spin_unlock(&dev_domain_list_spinlock);
- kfree(owner);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
#endif
+
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 0ac3d4357136..65fa3d866226 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -249,7 +249,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
@@ -263,7 +263,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index 84b09c230cbd..a50245157685 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
KASAN_SANITIZE := n
GCOV_PROFILE := n
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 8a26e705cb06..1591d67e0bcd 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = {
#ifdef CONFIG_LOAD_UEFI_KEYS
&efi.mokvar_table,
#endif
+#ifdef CONFIG_EFI_COCO_SECRET
+ &efi.coco_secret,
+#endif
};
u64 efi_setup; /* efi setup_data physical address */
@@ -468,7 +471,7 @@ void __init efi_init(void)
*/
if (!efi_runtime_supported())
- pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
+ pr_err("No EFI runtime due to 32/64-bit mismatch with kernel\n");
if (!efi_runtime_supported() || efi_runtime_disabled()) {
efi_memmap_unmap();
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 1b82d77019b1..1f3675453a57 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -33,7 +33,7 @@
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/ucs2_string.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/sched/task.h>
#include <asm/setup.h>
@@ -47,7 +47,7 @@
#include <asm/realmode.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
/*
* We allocate runtime services regions top-down, starting from -4G, i.e.
@@ -195,7 +195,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
}
/*
- * Certain firmware versions are way too sentimential and still believe
+ * Certain firmware versions are way too sentimental and still believe
* they are exclusive and unquestionable owners of the first physical page,
* even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY
* (but then write-access it later during SetVirtualAddressMap()).
@@ -284,7 +284,8 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
if (!(md->attribute & EFI_MEMORY_WB))
flags |= _PAGE_PCD;
- if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO)
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ md->type != EFI_MEMORY_MAPPED_IO)
flags |= _PAGE_ENC;
pfn = md->phys_addr >> PAGE_SHIFT;
@@ -390,7 +391,7 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
if (!(md->attribute & EFI_MEMORY_RO))
pf |= _PAGE_RW;
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pf |= _PAGE_ENC;
return efi_update_mappings(md, pf);
@@ -438,7 +439,7 @@ void __init efi_runtime_update_mappings(void)
(md->type != EFI_RUNTIME_SERVICES_CODE))
pf |= _PAGE_RW;
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pf |= _PAGE_ENC;
efi_update_mappings(md, pf);
@@ -457,7 +458,7 @@ void __init efi_dump_pagetable(void)
* in a kernel thread and user context. Preemption needs to remain disabled
* while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm
* can not change under us.
- * It should be ensured that there are no concurent calls to this function.
+ * It should be ensured that there are no concurrent calls to this function.
*/
void efi_enter_mm(void)
{
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index 09ec84f6ef51..f3cfdb1c9a35 100644
--- a/arch/x86/platform/efi/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -56,5 +56,5 @@ SYM_FUNC_START(efi_call_svam)
movl 16(%esp), %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi_call_svam)
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 90380a17ab23..2206b8bc47b8 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -23,5 +23,5 @@ SYM_FUNC_START(__efi_call)
mov %rsi, %rcx
CALL_NOSPEC rdi
leave
- ret
+ RET
SYM_FUNC_END(__efi_call)
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index fd3dd1708eba..9ffe2bad27d5 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -8,7 +8,7 @@
* The below thunking functions are only used after ExitBootServices()
* has been called. This simplifies things considerably as compared with
* the early EFI thunking because we can leave all the kernel state
- * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime
+ * intact (GDT, IDT, etc) and simply invoke the 32-bit EFI runtime
* services from __KERNEL32_CS. This means we can continue to service
* interrupts across an EFI mixed mode call.
*
@@ -20,12 +20,14 @@
*/
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/page_types.h>
#include <asm/segment.h>
.text
.code64
-SYM_CODE_START(__efi64_thunk)
+SYM_FUNC_START(__efi64_thunk)
+STACK_FRAME_NON_STANDARD __efi64_thunk
push %rbp
push %rbx
@@ -37,6 +39,17 @@ SYM_CODE_START(__efi64_thunk)
push %rax
/*
+ * Copy args passed via the stack
+ */
+ subq $0x24, %rsp
+ movq 0x18(%rax), %rbp
+ movq 0x20(%rax), %rbx
+ movq 0x28(%rax), %rax
+ movl %ebp, 0x18(%rsp)
+ movl %ebx, 0x1c(%rsp)
+ movl %eax, 0x20(%rsp)
+
+ /*
* Calculate the physical address of the kernel text.
*/
movq $__START_KERNEL_map, %rax
@@ -47,7 +60,6 @@ SYM_CODE_START(__efi64_thunk)
subq %rax, %rbp
subq %rax, %rbx
- subq $28, %rsp
movl %ebx, 0x0(%rsp) /* return address */
movl %esi, 0x4(%rsp)
movl %edx, 0x8(%rsp)
@@ -60,16 +72,16 @@ SYM_CODE_START(__efi64_thunk)
pushq %rdi /* EFI runtime service address */
lretq
-1: movq 24(%rsp), %rsp
+1: movq 0x20(%rsp), %rsp
pop %rbx
pop %rbp
- retq
+ RET
.code32
2: pushl $__KERNEL_CS
pushl %ebp
lret
-SYM_CODE_END(__efi64_thunk)
+SYM_FUNC_END(__efi64_thunk)
.bss
.balign 8
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 67d93a243c35..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
@@ -441,7 +442,7 @@ void __init efi_free_boot_services(void)
* 1.4.4 with SGX enabled booting Linux via Fedora 24's
* grub2-efi on a hard disk. (And no, I don't know why
* this happened, but Linux should still try to boot rather
- * panicing early.)
+ * panicking early.)
*/
rm_size = real_mode_size_needed();
if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
@@ -450,6 +451,18 @@ void __init efi_free_boot_services(void)
size -= rm_size;
}
+ /*
+ * Don't free memory under 1M for two reasons:
+ * - BIOS might clobber it
+ * - Crash kernel needs it to be reserved
+ */
+ if (start + size < SZ_1M)
+ continue;
+ if (start < SZ_1M) {
+ size -= (SZ_1M - start);
+ start = SZ_1M;
+ }
+
memblock_free_late(start, size);
}
@@ -726,7 +739,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
* Buggy efi_reset_system() is handled differently from other EFI
* Runtime Services as it doesn't use efi_rts_wq. Although,
* native_machine_emergency_restart() says that machine_real_restart()
- * could fail, it's better not to compilcate this fault handler
+ * could fail, it's better not to complicate this fault handler
* because this case occurs *very* rarely and hence could be improved
* on a need by basis.
*/
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 0286fe1b14b5..d3d456925b2a 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* imr.c -- Intel Isolated Memory Region driver
*
* Copyright(c) 2013 Intel Corporation.
@@ -551,7 +551,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
/*
* Setup an unlocked IMR around the physical extent of the kernel
- * from the beginning of the .text secton to the end of the
+ * from the beginning of the .text section to the end of the
* .rodata section as one physically contiguous block.
*
* We don't round up @size since it is already PAGE_SIZE aligned.
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 570e3062faac..761f3689f60a 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* imr_selftest.c -- Intel Isolated Memory Region self-test driver
*
* Copyright(c) 2013 Intel Corporation.
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index 526f70f27c1c..fdd49d70b437 100644
--- a/arch/x86/platform/intel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -187,7 +187,7 @@ bool iosf_mbi_available(void)
EXPORT_SYMBOL(iosf_mbi_available);
/*
- **************** P-Unit/kernel shared I2C bus arbritration ****************
+ **************** P-Unit/kernel shared I2C bus arbitration ****************
*
* Some Bay Trail and Cherry Trail devices have the P-Unit and us (the kernel)
* share a single I2C bus to the PMIC. Below are helpers to arbitrate the
@@ -493,7 +493,7 @@ static void iosf_sideband_debug_init(void)
/* mcrx */
debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
- /* mcr - initiates mailbox tranaction */
+ /* mcr - initiates mailbox transaction */
debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
}
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 85f4638764d6..994a229cb79f 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -27,7 +27,7 @@ static bool lid_wake_on_close;
* wake-on-close. This is implemented as standard by the XO-1.5 DSDT.
*
* We provide here a sysfs attribute that will additionally enable
- * wake-on-close behavior. This is useful (e.g.) when we oportunistically
+ * wake-on-close behavior. This is useful (e.g.) when we opportunistically
* suspend with the display running; if the lid is then closed, we want to
* wake up to turn the display off.
*
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index ee2beda590d0..1d4a00e767ec 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = {
static struct olpc_ec_driver ec_xo1_5_driver = {
.ec_cmd = olpc_xo1_ec_cmd,
-#ifdef CONFIG_OLPC_XO1_5_SCI
+#ifdef CONFIG_OLPC_XO15_SCI
/*
* XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
* compiled in
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 26d1f6693789..75e3319e8bee 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -131,7 +131,7 @@ void * __init prom_early_alloc(unsigned long size)
const size_t chunk_size = max(PAGE_SIZE, size);
/*
- * To mimimize the number of allocations, grab at least
+ * To minimize the number of allocations, grab at least
* PAGE_SIZE of memory (that's an arbitrary choice that's
* fast enough on the platforms we care about while minimizing
* wasted bootmem) and hand off chunks of it to callers.
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
index 75f4faff8468..3a5abffe5660 100644
--- a/arch/x86/platform/olpc/xo1-wakeup.S
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -77,7 +77,7 @@ save_registers:
pushfl
popl saved_context_eflags
- ret
+ RET
restore_registers:
movl saved_context_ebp, %ebp
@@ -88,7 +88,7 @@ restore_registers:
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_olpc_suspend_lowlevel)
call save_processor_state
@@ -109,7 +109,7 @@ ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_olpc_suspend_lowlevel)
.data
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 9ac7457f52a3..ed0442e35434 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -16,15 +16,15 @@
/*
* PVH variables.
*
- * pvh_bootparams and pvh_start_info need to live in the data segment since
+ * pvh_bootparams and pvh_start_info need to live in a data segment since
* they are used after startup_{32|64}, which clear .bss, are invoked.
*/
-struct boot_params pvh_bootparams __section(".data");
-struct hvm_start_info pvh_start_info __section(".data");
+struct boot_params __initdata pvh_bootparams;
+struct hvm_start_info __initdata pvh_start_info;
-unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info);
-static u64 pvh_get_root_pointer(void)
+static u64 __init pvh_get_root_pointer(void)
{
return pvh_start_info.rsdp_paddr;
}
@@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params)
BUG();
}
-static void hypervisor_specific_init(bool xen_guest)
+static void __init hypervisor_specific_init(bool xen_guest)
{
if (xen_guest)
xen_pvh_init(&pvh_bootparams);
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index d2ccadc247e6..7fe564eaf228 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -30,10 +30,10 @@
* the boot start info structure.
* - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
* - `cr4`: all bits are cleared.
- * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’
- * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of `0`
+ * and a limit of `0xFFFFFFFF`. The selector value is unspecified.
* - `ds`, `es`: must be a 32-bit read/write data segment with a base of
- * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all
+ * `0` and a limit of `0xFFFFFFFF`. The selector values are all
* unspecified.
* - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
* of '0x67'.
@@ -46,12 +46,11 @@
#define PVH_GDT_ENTRY_CS 1
#define PVH_GDT_ENTRY_DS 2
-#define PVH_GDT_ENTRY_CANARY 3
#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
-#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
+ UNWIND_HINT_EMPTY
cld
lgdt (_pa(gdt))
@@ -111,17 +110,6 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
#else /* CONFIG_X86_64 */
- /* Set base address in stack canary descriptor. */
- movl $_pa(gdt_start),%eax
- movl $_pa(canary),%ecx
- movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax)
- shrl $16, %ecx
- movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax)
- movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax)
-
- mov $PVH_CANARY_SEL,%eax
- mov %eax,%gs
-
call mk_early_pgtbl_32
mov $_pa(initial_page_table), %eax
@@ -165,7 +153,6 @@ SYM_DATA_START_LOCAL(gdt_start)
.quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
#endif
.quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
- .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */
SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end)
.balign 16
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index eafc530c8767..a60af0230e27 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -24,6 +24,7 @@
#include <asm/kdebug.h>
#include <asm/local64.h>
#include <asm/nmi.h>
+#include <asm/reboot.h>
#include <asm/traps.h>
#include <asm/uv/uv.h>
#include <asm/uv/uv_hub.h>
@@ -91,6 +92,8 @@ static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
static atomic_t uv_nmi_slave_continue;
static cpumask_var_t uv_nmi_cpu_mask;
+static atomic_t uv_nmi_kexec_failed;
+
/* Values for uv_nmi_slave_continue */
#define SLAVE_CLEAR 0
#define SLAVE_CONTINUE 1
@@ -241,8 +244,10 @@ static inline bool uv_nmi_action_is(const char *action)
/* Setup which NMI support is present in system */
static void uv_nmi_setup_mmrs(void)
{
+ bool new_nmi_method_only = false;
+
/* First determine arch specific MMRs to handshake with BIOS */
- if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) {
+ if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { /* UV2,3,4 setup */
uvh_nmi_mmrx = UVH_EVENT_OCCURRED0;
uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED0_ALIAS;
uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT;
@@ -252,26 +257,25 @@ static void uv_nmi_setup_mmrs(void)
uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2;
uvh_nmi_mmrx_req_shift = 62;
- } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) {
+ } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { /* UV5+ setup */
uvh_nmi_mmrx = UVH_EVENT_OCCURRED1;
uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED1_ALIAS;
uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT;
uvh_nmi_mmrx_type = "OCRD1-EXTIO_INT0";
- uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST;
- uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2;
- uvh_nmi_mmrx_req_shift = 62;
+ new_nmi_method_only = true; /* Newer nmi always valid on UV5+ */
+ uvh_nmi_mmrx_req = 0; /* no request bit to clear */
} else {
- pr_err("UV:%s:cannot find EVENT_OCCURRED*_EXTIO_INT0\n",
- __func__);
+ pr_err("UV:%s:NMI support not available on this system\n", __func__);
return;
}
/* Then find out if new NMI is supported */
- if (likely(uv_read_local_mmr(uvh_nmi_mmrx_supported))) {
- uv_write_local_mmr(uvh_nmi_mmrx_req,
- 1UL << uvh_nmi_mmrx_req_shift);
+ if (new_nmi_method_only || uv_read_local_mmr(uvh_nmi_mmrx_supported)) {
+ if (uvh_nmi_mmrx_req)
+ uv_write_local_mmr(uvh_nmi_mmrx_req,
+ 1UL << uvh_nmi_mmrx_req_shift);
nmi_mmr = uvh_nmi_mmrx;
nmi_mmr_clear = uvh_nmi_mmrx_clear;
nmi_mmr_pending = 1UL << uvh_nmi_mmrx_shift;
@@ -834,38 +838,35 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-static atomic_t uv_nmi_kexec_failed;
-
-#if defined(CONFIG_KEXEC_CORE)
-static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+static void uv_nmi_kdump(int cpu, int main, struct pt_regs *regs)
{
+ /* Check if kdump kernel loaded for both main and secondary CPUs */
+ if (!kexec_crash_image) {
+ if (main)
+ pr_err("UV: NMI error: kdump kernel not loaded\n");
+ return;
+ }
+
/* Call crash to dump system state */
- if (master) {
+ if (main) {
pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu);
crash_kexec(regs);
- pr_emerg("UV: crash_kexec unexpectedly returned, ");
+ pr_emerg("UV: crash_kexec unexpectedly returned\n");
atomic_set(&uv_nmi_kexec_failed, 1);
- if (!kexec_crash_image) {
- pr_cont("crash kernel not loaded\n");
- return;
- }
- pr_cont("kexec busy, stalling cpus while waiting\n");
- }
- /* If crash exec fails the slaves should return, otherwise stall */
- while (atomic_read(&uv_nmi_kexec_failed) == 0)
- mdelay(10);
-}
+ } else { /* secondary */
-#else /* !CONFIG_KEXEC_CORE */
-static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
-{
- if (master)
- pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
- atomic_set(&uv_nmi_kexec_failed, 1);
+ /* If kdump kernel fails, secondaries will exit this loop */
+ while (atomic_read(&uv_nmi_kexec_failed) == 0) {
+
+ /* Once shootdown cpus starts, they do not return */
+ run_crash_ipi_callback(regs);
+
+ mdelay(10);
+ }
+ }
}
-#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
@@ -889,7 +890,7 @@ static inline int uv_nmi_kdb_reason(void)
* Call KGDB/KDB from NMI handler
*
* Note that if both KGDB and KDB are configured, then the action of 'kgdb' or
- * 'kdb' has no affect on which is used. See the KGDB documention for further
+ * 'kdb' has no affect on which is used. See the KGDB documentation for further
* information.
*/
static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
@@ -985,7 +986,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
/* Clear global flags */
if (master) {
- if (cpumask_weight(uv_nmi_cpu_mask))
+ if (!cpumask_empty(uv_nmi_cpu_mask))
uv_nmi_cleanup_mask();
atomic_set(&uv_nmi_cpus_in_nmi, -1);
atomic_set(&uv_nmi_cpu, -1);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index db1378c6ff26..bb176c72891c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -20,11 +20,12 @@
#include <asm/page.h>
#include <asm/mce.h>
#include <asm/suspend.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -40,7 +41,8 @@ static void msr_save_context(struct saved_context *ctxt)
struct saved_msr *end = msr + ctxt->saved_msrs.num;
while (msr < end) {
- msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ if (msr->valid)
+ rdmsrl(msr->info.msr_no, msr->info.reg.q);
msr++;
}
}
@@ -58,19 +60,20 @@ static void msr_restore_context(struct saved_context *ctxt)
}
/**
- * __save_processor_state - save CPU registers before creating a
- * hibernation image and before restoring the memory state from it
- * @ctxt - structure to store the registers contents in
+ * __save_processor_state() - Save CPU registers before creating a
+ * hibernation image and before restoring
+ * the memory state from it
+ * @ctxt: Structure to store the registers contents in.
*
- * NOTE: If there is a CPU register the modification of which by the
- * boot kernel (ie. the kernel used for loading the hibernation image)
- * might affect the operations of the restored target kernel (ie. the one
- * saved in the hibernation image), then its contents must be saved by this
- * function. In other words, if kernel A is hibernated and different
- * kernel B is used for loading the hibernation image into memory, the
- * kernel A's __save_processor_state() function must save all registers
- * needed by kernel A, so that it can operate correctly after the resume
- * regardless of what kernel B does in the meantime.
+ * NOTE: If there is a CPU register the modification of which by the
+ * boot kernel (ie. the kernel used for loading the hibernation image)
+ * might affect the operations of the restored target kernel (ie. the one
+ * saved in the hibernation image), then its contents must be saved by this
+ * function. In other words, if kernel A is hibernated and different
+ * kernel B is used for loading the hibernation image into memory, the
+ * kernel A's __save_processor_state() function must save all registers
+ * needed by kernel A, so that it can operate correctly after the resume
+ * regardless of what kernel B does in the meantime.
*/
static void __save_processor_state(struct saved_context *ctxt)
{
@@ -99,11 +102,8 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* segment registers
*/
-#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, ctxt->gs);
-#endif
#ifdef CONFIG_X86_64
- savesegment(gs, ctxt->gs);
savesegment(fs, ctxt->fs);
savesegment(ds, ctxt->ds);
savesegment(es, ctxt->es);
@@ -184,9 +184,9 @@ static void fix_processor_context(void)
}
/**
- * __restore_processor_state - restore the contents of CPU registers saved
- * by __save_processor_state()
- * @ctxt - structure to load the registers contents from
+ * __restore_processor_state() - Restore the contents of CPU registers saved
+ * by __save_processor_state()
+ * @ctxt: Structure to load the registers contents from.
*
* The asm code that gets us here will have restored a usable GDT, although
* it will be pointing to the wrong alias.
@@ -232,7 +232,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
#else
loadsegment(fs, __KERNEL_PERCPU);
- loadsegment(gs, __KERNEL_STACK_CANARY);
#endif
/* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
@@ -255,7 +254,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
*/
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
-#elif defined(CONFIG_X86_32_LAZY_GS)
+#else
loadsegment(gs, ctxt->gs);
#endif
@@ -264,11 +263,18 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
- msr_restore_context(ctxt);
c = &cpu_data(smp_processor_id());
if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL))
init_ia32_feat_ctl(c);
+
+ microcode_bsp_resume();
+
+ /*
+ * This needs to happen after the microcode has been updated upon resume
+ * because some of the MSRs are "emulated" in microcode.
+ */
+ msr_restore_context(ctxt);
}
/* Needed by apm.c */
@@ -321,7 +327,7 @@ int hibernate_resume_nonboot_cpu_disable(void)
/*
* When bsp_check() is called in hibernate and suspend, cpu hotplug
- * is disabled already. So it's unnessary to handle race condition between
+ * is disabled already. So it's unnecessary to handle race condition between
* cpumask query and cpu hotplug.
*/
static int bsp_check(void)
@@ -427,8 +433,10 @@ static int msr_build_context(const u32 *msr_id, const int num)
}
for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ u64 dummy;
+
msr_array[i].info.msr_no = msr_id[j];
- msr_array[i].valid = false;
+ msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy);
msr_array[i].info.reg.q = 0;
}
saved_msrs->num = total_num;
@@ -503,10 +511,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
return ret;
}
+static void pm_save_spec_msr(void)
+{
+ u32 spec_msr_id[] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_TSX_CTRL,
+ MSR_TSX_FORCE_ABORT,
+ MSR_IA32_MCU_OPT_CTRL,
+ MSR_AMD64_LS_CFG,
+ };
+
+ msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
pm_cpu_check(msr_save_cpu_table);
+ pm_save_spec_msr();
return 0;
}
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index cd3914fc9f3d..e94e0050a583 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -13,8 +13,8 @@
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/pgtable.h>
-
-#include <crypto/hash.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
#include <asm/e820/api.h>
#include <asm/init.h>
@@ -54,95 +54,33 @@ int pfn_is_nosave(unsigned long pfn)
return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
}
-
-#define MD5_DIGEST_SIZE 16
-
struct restore_data_record {
unsigned long jump_address;
unsigned long jump_address_phys;
unsigned long cr3;
unsigned long magic;
- u8 e820_digest[MD5_DIGEST_SIZE];
+ unsigned long e820_checksum;
};
-#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
/**
- * get_e820_md5 - calculate md5 according to given e820 table
+ * compute_e820_crc32 - calculate crc32 of a given e820 table
*
* @table: the e820 table to be calculated
- * @buf: the md5 result to be stored to
+ *
+ * Return: the resulting checksum
*/
-static int get_e820_md5(struct e820_table *table, void *buf)
+static inline u32 compute_e820_crc32(struct e820_table *table)
{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int size;
- int ret = 0;
-
- tfm = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(tfm))
- return -ENOMEM;
-
- desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
- GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto free_tfm;
- }
-
- desc->tfm = tfm;
-
- size = offsetof(struct e820_table, entries) +
+ int size = offsetof(struct e820_table, entries) +
sizeof(struct e820_entry) * table->nr_entries;
- if (crypto_shash_digest(desc, (u8 *)table, size, buf))
- ret = -EINVAL;
-
- kfree_sensitive(desc);
-
-free_tfm:
- crypto_free_shash(tfm);
- return ret;
-}
-
-static int hibernation_e820_save(void *buf)
-{
- return get_e820_md5(e820_table_firmware, buf);
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- int ret;
- u8 result[MD5_DIGEST_SIZE];
-
- memset(result, 0, MD5_DIGEST_SIZE);
- /* If there is no digest in suspend kernel, let it go. */
- if (!memcmp(result, buf, MD5_DIGEST_SIZE))
- return false;
-
- ret = get_e820_md5(e820_table_firmware, result);
- if (ret)
- return true;
-
- return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
-}
-#else
-static int hibernation_e820_save(void *buf)
-{
- return 0;
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- /* If md5 is not builtin for restore kernel, let it go. */
- return false;
+ return ~crc32_le(~0, (unsigned char const *)table, size);
}
-#endif
#ifdef CONFIG_X86_64
-#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+#define RESTORE_MAGIC 0x23456789ABCDEF02UL
#else
-#define RESTORE_MAGIC 0x12345678UL
+#define RESTORE_MAGIC 0x12345679UL
#endif
/**
@@ -179,7 +117,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
*/
rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
- return hibernation_e820_save(rdr->e820_digest);
+ rdr->e820_checksum = compute_e820_crc32(e820_table_firmware);
+ return 0;
}
/**
@@ -200,7 +139,7 @@ int arch_hibernation_header_restore(void *addr)
jump_address_phys = rdr->jump_address_phys;
restore_cr3 = rdr->cr3;
- if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) {
pr_crit("Hibernate inconsistent memory map detected!\n");
return -ENODEV;
}
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 8786653ad3c0..5606a15cf9a1 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_CODE_START(restore_image)
@@ -108,5 +108,5 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movl %eax, in_suspend
- ret
+ RET
SYM_FUNC_END(restore_registers)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index d9bed596d849..0a0539e1cc81 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -66,7 +66,7 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
- ret
+ RET
SYM_FUNC_END(restore_registers)
SYM_FUNC_START(swsusp_arch_suspend)
@@ -96,7 +96,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_FUNC_START(restore_image)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..ae53d54d7959 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -16,7 +16,7 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
-PURGATORY_LDFLAGS := -e purgatory_start -nostdlib -z nodefaultlib
+PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS)
LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS)
targets += purgatory.ro purgatory.chk
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index f03b64d9cb51..7558139920f8 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -9,6 +9,8 @@
*/
#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include <crypto/sha2.h>
#include <asm/purgatory.h>
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index 6b1f3a4eeb44..a0b491ae2de8 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -10,7 +10,6 @@
# Sanitizer runtimes are unavailable and cannot be linked here.
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
subdir- := rm
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 22fda7d99159..41d7669a97ad 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -2,14 +2,14 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/memblock.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/pgtable.h>
#include <asm/set_memory.h>
#include <asm/realmode.h>
#include <asm/tlbflush.h>
#include <asm/crash.h>
-#include <asm/sev-es.h>
+#include <asm/sev.h>
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
@@ -17,6 +17,32 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ /*
+ * This function is called before exiting to real-mode and that will
+ * fail with CR4.PCIDE still set.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /*
+ * The CR3 write above will not flush global TLB entries.
+ * Stale, global entries from previous page tables may still be
+ * present. Flush those stale entries.
+ *
+ * This ensures that memory accessed while running with
+ * trampoline_pgd is *actually* mapped into trampoline_pgd.
+ */
+ __flush_tlb_all();
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -28,24 +54,26 @@ void __init reserve_real_mode(void)
WARN_ON(slab_is_available());
/* Has to be under 1M so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem) {
+ mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
+ if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
- return;
- }
+ else
+ set_real_mode_mem(mem);
- memblock_reserve(mem, size);
- set_real_mode_mem(mem);
- crash_reserve_low_1M();
+ /*
+ * Unconditionally reserve the entire fisrt 1M, see comment in
+ * setup_arch().
+ */
+ memblock_reserve(0, SZ_1M);
}
-static void sme_sev_setup_real_mode(struct trampoline_header *th)
+static void __init sme_sev_setup_real_mode(struct trampoline_header *th)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
th->flags |= TH_FLAGS_SME_ACTIVE;
- if (sev_es_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
/*
* Skip the call to verify_cpu() in secondary_startup_64 as it
* will cause #VC exceptions when the AP can't handle them yet.
@@ -70,6 +98,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -79,7 +108,7 @@ static void __init setup_real_mode(void)
* decrypted memory in order to bring up other processors
* successfully. This is not needed for SEV.
*/
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
memcpy(base, real_mode_blob, size);
@@ -103,7 +132,7 @@ static void __init setup_real_mode(void)
*ptr += phys_base;
}
- /* Must be perfomed *after* relocation. */
+ /* Must be performed *after* relocation. */
trampoline_header = (struct trampoline_header *)
__va(real_mode_header->trampoline_header);
@@ -126,8 +155,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 8c1db5bf5d78..2eb62be6d256 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header)
.long pa_sev_es_trampoline_start
#endif
#ifdef CONFIG_X86_64
+ .long pa_trampoline_start64
.long pa_trampoline_pgd;
#endif
/* ACPI S3 wakeup */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 84c5d1b33d10..e38d61d6562e 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start)
movw $__KERNEL_DS, %dx # Data segment descriptor
# Enable protected mode
- movl $X86_CR0_PE, %eax # protected mode (PE) bit
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
movl %eax, %cr0 # into protected mode
# flush prefetch and jump to startup_32
@@ -123,9 +123,9 @@ SYM_CODE_START(startup_32)
*/
btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
jnc .Ldone
- movl $MSR_K8_SYSCFG, %ecx
+ movl $MSR_AMD64_SYSCFG, %ecx
rdmsr
- bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+ bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax
jc .Ldone
/*
@@ -143,13 +143,24 @@ SYM_CODE_START(startup_32)
movl %eax, %cr3
# Set up EFER
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Skip writing to EFER if the register already has desired
+ * value (to avoid #VE for the TDX guest).
+ */
+ cmp pa_tr_efer, %eax
+ jne .Lwrite_efer
+ cmp pa_tr_efer + 4, %edx
+ je .Ldone_efer
+.Lwrite_efer:
movl pa_tr_efer, %eax
movl pa_tr_efer + 4, %edx
- movl $MSR_EFER, %ecx
wrmsr
- # Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+.Ldone_efer:
+ # Enable paging and in turn activate Long Mode.
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/*
@@ -161,6 +172,19 @@ SYM_CODE_START(startup_32)
ljmpl $__KERNEL_CS, $pa_startup_64
SYM_CODE_END(startup_32)
+SYM_CODE_START(pa_trampoline_compat)
+ /*
+ * In compatibility mode. Prep ESP and DX for startup_32, then disable
+ * paging and complete the switch to legacy 32-bit mode.
+ */
+ movl $rm_stack_end, %esp
+ movw $__KERNEL_DS, %dx
+
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
+ movl %eax, %cr0
+ ljmpl $__KERNEL32_CS, $pa_startup_32
+SYM_CODE_END(pa_trampoline_compat)
+
.section ".text64","ax"
.code64
.balign 4
@@ -169,6 +193,20 @@ SYM_CODE_START(startup_64)
jmpq *tr_start(%rip)
SYM_CODE_END(startup_64)
+SYM_CODE_START(trampoline_start64)
+ /*
+ * APs start here on a direct transfer from 64-bit BIOS with identity
+ * mapped page tables. Load the kernel's GDT in order to gear down to
+ * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load
+ * segment registers. Load the zero IDT so any fault triggers a
+ * shutdown instead of jumping back into BIOS.
+ */
+ lidt tr_idt(%rip)
+ lgdt tr_gdt64(%rip)
+
+ ljmpl *tr_compat(%rip)
+SYM_CODE_END(trampoline_start64)
+
.section ".rodata","a"
# Duplicate the global descriptor table
# so the kernel can live anywhere
@@ -182,6 +220,17 @@ SYM_DATA_START(tr_gdt)
.quad 0x00cf93000000ffff # __KERNEL_DS
SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end)
+SYM_DATA_START(tr_gdt64)
+ .short tr_gdt_end - tr_gdt - 1 # gdt limit
+ .long pa_tr_gdt
+ .long 0
+SYM_DATA_END(tr_gdt64)
+
+SYM_DATA_START(tr_compat)
+ .long pa_trampoline_compat
+ .short __KERNEL32_CS
+SYM_DATA_END(tr_compat)
+
.bss
.balign PAGE_SIZE
SYM_DATA(trampoline_pgd, .space PAGE_SIZE)
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
index 5033e640f957..4331c32c47f8 100644
--- a/arch/x86/realmode/rm/trampoline_common.S
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -1,4 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
.section ".rodata","a"
.balign 16
-SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0)
+
+/*
+ * When a bootloader hands off to the kernel in 32-bit mode an
+ * IDT with a 2-byte limit and 4-byte base is needed. When a boot
+ * loader hands off to a kernel 64-bit mode the base address
+ * extends to 8-bytes. Reserve enough space for either scenario.
+ */
+SYM_DATA_START_LOCAL(tr_idt)
+ .short 0
+ .quad 0
+SYM_DATA_END(tr_idt)
diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c
index 1d6437e6d2ba..a6f4d8388ad8 100644
--- a/arch/x86/realmode/rm/wakemain.c
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -62,8 +62,12 @@ static void send_morse(const char *pattern)
}
}
+struct port_io_ops pio_ops;
+
void main(void)
{
+ init_default_io_ops();
+
/* Kill machine if structures are wrong */
if (wakeup_header.real_magic != 0x12345678)
while (1)
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
index fd1ab80be0de..a4cf678cf5c8 100644
--- a/arch/x86/tools/chkobjdump.awk
+++ b/arch/x86/tools/chkobjdump.awk
@@ -10,6 +10,7 @@ BEGIN {
/^GNU objdump/ {
verstr = ""
+ gsub(/\(.*\)/, "");
for (i = 3; i <= NF; i++)
if (match($(i), "^[0-9]")) {
verstr = $(i);
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 34eda63c124b..472540aeabc2 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -120,7 +120,7 @@ int main(int argc, char **argv)
while (fgets(line, BUFSIZE, stdin)) {
char copy[BUFSIZE], *s, *tab1, *tab2;
- int nb = 0;
+ int nb = 0, ret;
unsigned int b;
if (line[0] == '<') {
@@ -148,10 +148,12 @@ int main(int argc, char **argv)
} else
break;
}
+
/* Decode an instruction */
- insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
- insn_get_length(&insn);
- if (insn.length != nb) {
+ ret = insn_decode(&insn, insn_buff, sizeof(insn_buff),
+ x86_64 ? INSN_MODE_64 : INSN_MODE_32);
+
+ if (ret < 0 || insn.length != nb) {
warnings++;
pr_warn("Found an x86 instruction decoder bug, "
"please report this.\n", sym);
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index c6a0000ae635..213f35f94feb 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -218,8 +218,8 @@ static void parse_args(int argc, char **argv)
int main(int argc, char **argv)
{
+ int insns = 0, ret;
struct insn insn;
- int insns = 0;
int errors = 0;
unsigned long i;
unsigned char insn_buff[MAX_INSN_SIZE * 2];
@@ -237,15 +237,15 @@ int main(int argc, char **argv)
continue;
/* Decode an instruction */
- insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
- insn_get_length(&insn);
+ ret = insn_decode(&insn, insn_buff, sizeof(insn_buff),
+ x86_64 ? INSN_MODE_64 : INSN_MODE_32);
if (insn.next_byte <= insn.kaddr ||
insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
/* Access out-of-range memory */
dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn);
errors++;
- } else if (verbose && !insn_complete(&insn))
+ } else if (verbose && ret < 0)
dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn);
else if (verbose >= 2)
dump_insn(stdout, &insn);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 04c5a44b9682..e2c5b296120d 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -14,6 +14,10 @@
static Elf_Ehdr ehdr;
static unsigned long shnum;
static unsigned int shstrndx;
+static unsigned int shsymtabndx;
+static unsigned int shxsymtabndx;
+
+static int sym_index(Elf_Sym *sym);
struct relocs {
uint32_t *offset;
@@ -26,12 +30,16 @@ static struct relocs relocs32;
#if ELF_BITS == 64
static struct relocs relocs32neg;
static struct relocs relocs64;
+#define FMT PRIu64
+#else
+#define FMT PRIu32
#endif
struct section {
Elf_Shdr shdr;
struct section *link;
Elf_Sym *symtab;
+ Elf32_Word *xsymtab;
Elf_Rel *reltab;
char *strtab;
};
@@ -57,12 +65,14 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
[S_REL] =
"^(__init_(begin|end)|"
"__x86_cpu_dev_(start|end)|"
- "(__parainstructions|__alt_instructions)(|_end)|"
- "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
+ "(__parainstructions|__alt_instructions)(_end)?|"
+ "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
+#if CONFIG_FW_LOADER
"__(start|end)_builtin_fw|"
- "__(start|stop)___ksymtab(|_gpl)|"
- "__(start|stop)___kcrctab(|_gpl)|"
+#endif
+ "__(start|stop)___ksymtab(_gpl)?|"
+ "__(start|stop)___kcrctab(_gpl)?|"
"__(start|stop)___param|"
"__(start|stop)___modver|"
"__(start|stop)___bug_table|"
@@ -265,7 +275,7 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
name = sym_strtab + sym->st_name;
}
else {
- name = sec_name(sym->st_shndx);
+ name = sec_name(sym_index(sym));
}
return name;
}
@@ -335,6 +345,23 @@ static uint64_t elf64_to_cpu(uint64_t val)
#define elf_xword_to_cpu(x) elf32_to_cpu(x)
#endif
+static int sym_index(Elf_Sym *sym)
+{
+ Elf_Sym *symtab = secs[shsymtabndx].symtab;
+ Elf32_Word *xsymtab = secs[shxsymtabndx].xsymtab;
+ unsigned long offset;
+ int index;
+
+ if (sym->st_shndx != SHN_XINDEX)
+ return sym->st_shndx;
+
+ /* calculate offset of sym from head of table. */
+ offset = (unsigned long)sym - (unsigned long)symtab;
+ index = offset / sizeof(*sym);
+
+ return elf32_to_cpu(xsymtab[index]);
+}
+
static void read_ehdr(FILE *fp)
{
if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
@@ -389,7 +416,7 @@ static void read_ehdr(FILE *fp)
Elf_Shdr shdr;
if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
- die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno));
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
die("Cannot read initial ELF section header: %s\n", strerror(errno));
@@ -412,17 +439,17 @@ static void read_shdrs(FILE *fp)
secs = calloc(shnum, sizeof(struct section));
if (!secs) {
- die("Unable to allocate %d section headers\n",
+ die("Unable to allocate %ld section headers\n",
shnum);
}
if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- ehdr.e_shoff, strerror(errno));
+ die("Seek to %" FMT " failed: %s\n",
+ ehdr.e_shoff, strerror(errno));
}
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
- die("Cannot read ELF section headers %d/%d: %s\n",
+ die("Cannot read ELF section headers %d/%ld: %s\n",
i, shnum, strerror(errno));
sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
@@ -450,12 +477,12 @@ static void read_strtabs(FILE *fp)
}
sec->strtab = malloc(sec->shdr.sh_size);
if (!sec->strtab) {
- die("malloc of %d bytes for strtab failed\n",
- sec->shdr.sh_size);
+ die("malloc of %" FMT " bytes for strtab failed\n",
+ sec->shdr.sh_size);
}
if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
}
if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
!= sec->shdr.sh_size) {
@@ -468,31 +495,60 @@ static void read_strtabs(FILE *fp)
static void read_symtabs(FILE *fp)
{
int i,j;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_SYMTAB) {
+ int num_syms;
+
+ switch (sec->shdr.sh_type) {
+ case SHT_SYMTAB_SHNDX:
+ sec->xsymtab = malloc(sec->shdr.sh_size);
+ if (!sec->xsymtab) {
+ die("malloc of %" FMT " bytes for xsymtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read extended symbol table: %s\n",
+ strerror(errno));
+ }
+ shxsymtabndx = i;
+ continue;
+
+ case SHT_SYMTAB:
+ num_syms = sec->shdr.sh_size / sizeof(Elf_Sym);
+
+ sec->symtab = malloc(sec->shdr.sh_size);
+ if (!sec->symtab) {
+ die("malloc of %" FMT " bytes for symtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read symbol table: %s\n",
+ strerror(errno));
+ }
+ for (j = 0; j < num_syms; j++) {
+ Elf_Sym *sym = &sec->symtab[j];
+
+ sym->st_name = elf_word_to_cpu(sym->st_name);
+ sym->st_value = elf_addr_to_cpu(sym->st_value);
+ sym->st_size = elf_xword_to_cpu(sym->st_size);
+ sym->st_shndx = elf_half_to_cpu(sym->st_shndx);
+ }
+ shsymtabndx = i;
+ continue;
+
+ default:
continue;
- }
- sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %d bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
- Elf_Sym *sym = &sec->symtab[j];
- sym->st_name = elf_word_to_cpu(sym->st_name);
- sym->st_value = elf_addr_to_cpu(sym->st_value);
- sym->st_size = elf_xword_to_cpu(sym->st_size);
- sym->st_shndx = elf_half_to_cpu(sym->st_shndx);
}
}
}
@@ -508,12 +564,12 @@ static void read_relocs(FILE *fp)
}
sec->reltab = malloc(sec->shdr.sh_size);
if (!sec->reltab) {
- die("malloc of %d bytes for relocs failed\n",
- sec->shdr.sh_size);
+ die("malloc of %" FMT " bytes for relocs failed\n",
+ sec->shdr.sh_size);
}
if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
}
if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
!= sec->shdr.sh_size) {
@@ -759,7 +815,9 @@ static void percpu_init(void)
*/
static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
{
- return (sym->st_shndx == per_cpu_shndx) &&
+ int shndx = sym_index(sym);
+
+ return (shndx == per_cpu_shndx) &&
strcmp(symname, "__init_begin") &&
strcmp(symname, "__per_cpu_load") &&
strncmp(symname, "init_per_cpu_", 13);
@@ -1092,7 +1150,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
sec_name(sec->shdr.sh_info),
rel_type(ELF_R_TYPE(rel->r_info)),
symname,
- sec_name(sym->st_shndx));
+ sec_name(sym_index(sym)));
return 0;
}
diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h
index 43c83c0fd22c..4c49c82446eb 100644
--- a/arch/x86/tools/relocs.h
+++ b/arch/x86/tools/relocs.h
@@ -17,6 +17,7 @@
#include <regex.h>
#include <tools/le_byteshift.h>
+__attribute__((__format__(printf, 1, 2)))
void die(char *fmt, ...) __attribute__((noreturn));
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 95d26a69088b..1bcd42c53039 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -8,7 +8,8 @@ endmenu
config UML_X86
def_bool y
- select GENERIC_FIND_FIRST_BIT
+ select ARCH_BINFMT_ELF_EXTRA_PHDRS if X86_32
+ select DCACHE_WORD_ACCESS
config 64BIT
bool "64-bit kernel" if "$(SUBARCH)" = "x86"
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 77f70b969d14..ba5789c35809 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
+subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
subarch-y += ../kernel/sys_ia32.o
else
@@ -39,7 +40,7 @@ $(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
-Iarch/x86/include/generated
targets += user-offsets.s
-include/generated/user_constants.h: $(obj)/user-offsets.s
+include/generated/user_constants.h: $(obj)/user-offsets.s FORCE
$(call filechk,offsets,__USER_CONSTANT_H__)
UNPROFILE_OBJS := stub_segv.o
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 165be7f9a964..4da336965698 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -2,6 +2,7 @@
#ifndef _ASM_UM_BARRIER_H_
#define _ASM_UM_BARRIER_H_
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/*
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index c907b20d4993..dcaf3b38a9e0 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -212,6 +212,6 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
-#define SET_PERSONALITY(ex) do ; while(0)
+#define SET_PERSONALITY(ex) do {} while(0)
#endif
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
index 453db377150d..2ef507bc6989 100644
--- a/arch/x86/um/asm/segment.h
+++ b/arch/x86/um/asm/segment.h
@@ -8,12 +8,4 @@ extern int host_gdt_entry_tls_min;
#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define KERNEL_DS MAKE_MM_SEG(~0UL)
-#define USER_DS MAKE_MM_SEG(TASK_SIZE)
-
#endif
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index 13f118dec74f..aed782ab7721 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -110,7 +110,7 @@ csum_partial:
7:
popl %ebx
popl %esi
- ret
+ RET
#else
@@ -208,7 +208,7 @@ csum_partial:
80:
popl %ebx
popl %esi
- ret
+ RET
#endif
EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 3ee234b6234d..255a44dd415a 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -23,9 +23,11 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
{
long res;
void *stub_addr;
+
+ BUILD_BUG_ON(sizeof(*desc) % sizeof(long));
+
res = syscall_stub_data(mm_idp, (unsigned long *)desc,
- (sizeof(*desc) + sizeof(long) - 1) &
- ~(sizeof(long) - 1),
+ sizeof(*desc) / sizeof(long),
addr, &stub_addr);
if (!res) {
unsigned long args[] = { func,
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
index 3c423dfcd78b..df8f4b4bf98b 100644
--- a/arch/x86/um/os-Linux/registers.c
+++ b/arch/x86/um/os-Linux/registers.c
@@ -15,6 +15,7 @@
#include <sys/uio.h>
#include <asm/sigcontext.h>
#include <linux/elf.h>
+#include <registers.h>
int have_xstate_support;
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 2497bac56066..0bc4b73a9cde 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <asm/ptrace-abi.h>
+#include <registers.h>
#include <skas.h>
extern int arch_switch_tls(struct task_struct *to);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 1401899dee9b..289d0159b041 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -11,6 +11,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <linux/uaccess.h>
+#include <registers.h>
#include <asm/ptrace-abi.h>
/*
diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S
index 62eaf8c80e04..2d991ddbcca5 100644
--- a/arch/x86/um/setjmp_32.S
+++ b/arch/x86/um/setjmp_32.S
@@ -34,7 +34,7 @@ kernel_setjmp:
movl %esi,12(%edx)
movl %edi,16(%edx)
movl %ecx,20(%edx) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S
index 1b5d40d4ff46..b46acb6a8ebd 100644
--- a/arch/x86/um/setjmp_64.S
+++ b/arch/x86/um/setjmp_64.S
@@ -33,7 +33,7 @@ kernel_setjmp:
movq %r14,40(%rdi)
movq %r15,48(%rdi)
movq %rsi,56(%rdi) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index c3891c1ada26..4c6c2be0c899 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -77,7 +77,7 @@ static inline void trap_myself(void)
__asm("int3");
}
-static void inline remap_stack_and_trap(void)
+static inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movl %%esp,%%ebx ;"
@@ -101,4 +101,16 @@ static void inline remap_stack_and_trap(void)
"memory");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movl %%esp,%0 ;"
+ "andl %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 6e2626b77a2e..e9c4b2b38803 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -108,4 +108,16 @@ static inline void remap_stack_and_trap(void)
__syscall_clobber, "r10", "r8", "r9");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movq %%rsp,%0 ;"
+ "andq %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index 8a7d5e1da98e..b6b997225841 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -10,22 +10,18 @@
#include <linux/msg.h>
#include <linux/shm.h>
-typedef long syscall_handler_t(void);
+typedef long syscall_handler_t(long, long, long, long, long, long);
extern syscall_handler_t *sys_call_table[];
#define EXECUTE_SYSCALL(syscall, regs) \
- (((long (*)(long, long, long, long, long, long)) \
- (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
+ (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
UPT_SYSCALL_ARG2(&regs->regs), \
UPT_SYSCALL_ARG3(&regs->regs), \
UPT_SYSCALL_ARG4(&regs->regs), \
UPT_SYSCALL_ARG5(&regs->regs), \
UPT_SYSCALL_ARG6(&regs->regs)))
-extern long old_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff);
extern syscall_handler_t sys_modify_ldt;
extern syscall_handler_t sys_arch_prctl;
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 7c11c9e5d7ea..263e1d08f216 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -12,6 +12,7 @@
#include <linux/uaccess.h>
#include <asm/ucontext.h>
#include <frame_kern.h>
+#include <registers.h>
#include <skas.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index 21836eaf1725..f7eefba034f9 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -11,9 +11,8 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
- int stack;
+ struct faultinfo *f = get_stub_page();
ucontext_t *uc = p;
- struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1));
GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext);
trap_myself();
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 2ed81e581755..89df5d89d664 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -7,11 +7,8 @@
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -24,22 +21,17 @@
#define sys_vm86old sys_ni_syscall
#define sys_vm86 sys_ni_syscall
-#define old_mmap sys_old_mmap
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
-#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym) [ nr ] = sym,
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 2e8544dafbb0..b0b4cfd2308c 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -7,11 +7,8 @@
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -21,38 +18,15 @@
#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
-/*
- * The UML TLS problem. Note that x86_64 does not implement this, so the below
- * is needed only for the ia32 compatibility.
- */
-
-/* On UML we call it this way ("old" means it's not mmap2) */
-#define sys_mmap old_mmap
-
-#define stub_clone sys_clone
-#define stub_fork sys_fork
-#define stub_vfork sys_vfork
-#define stub_execve sys_execve
-#define stub_execveat sys_execveat
-#define stub_rt_sigreturn sys_rt_sigreturn
-
-#define __SYSCALL_X32(nr, sym)
-#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym)
-
-#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
-#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym) [ nr ] = sym,
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 58f51667e2e4..27b29ae6c471 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <registers.h>
#include <os.h>
long arch_prctl(struct task_struct *task, int option,
@@ -35,7 +36,7 @@ long arch_prctl(struct task_struct *task, int option,
switch (option) {
case ARCH_SET_FS:
case ARCH_SET_GS:
- ret = restore_registers(pid, &current->thread.regs.regs);
+ ret = restore_pid_registers(pid, &current->thread.regs.regs);
if (ret)
return ret;
break;
@@ -87,3 +88,13 @@ void arch_switch_to(struct task_struct *to)
arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
}
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
+{
+ if (off & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+}
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index bae61554abcc..e54a9814ccf1 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,12 +8,11 @@
#define __FRAME_OFFSETS
#include <linux/ptrace.h>
#include <asm/types.h>
+#include <linux/kbuild.h>
-#define DEFINE(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define DEFINE_LONGS(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long)))
+#define DEFINE_LONGS(sym, val) \
+ COMMENT(#val " / sizeof(unsigned long)"); \
+ DEFINE(sym, val / sizeof(unsigned long))
void foo(void)
{
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
new file mode 100644
index 000000000000..49a54356ae99
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/asm-offsets.h>
+#include <asm/tdx.h>
+
+/*
+ * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
+ */
+#define tdcall .byte 0x66,0x0f,0x01,0xcc
+#define seamcall .byte 0x66,0x0f,0x01,0xcf
+
+/*
+ * TDX_MODULE_CALL - common helper macro for both
+ * TDCALL and SEAMCALL instructions.
+ *
+ * TDCALL - used by TDX guests to make requests to the
+ * TDX module and hypercalls to the VMM.
+ * SEAMCALL - used by TDX hosts to make requests to the
+ * TDX module.
+ */
+.macro TDX_MODULE_CALL host:req
+ /*
+ * R12 will be used as temporary storage for struct tdx_module_output
+ * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL
+ * services supported by this function, it can be reused.
+ */
+
+ /* Callee saved, so preserve it */
+ push %r12
+
+ /*
+ * Push output pointer to stack.
+ * After the operation, it will be fetched into R12 register.
+ */
+ push %r9
+
+ /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */
+ /* Move Leaf ID to RAX */
+ mov %rdi, %rax
+ /* Move input 4 to R9 */
+ mov %r8, %r9
+ /* Move input 3 to R8 */
+ mov %rcx, %r8
+ /* Move input 1 to RCX */
+ mov %rsi, %rcx
+ /* Leave input param 2 in RDX */
+
+ .if \host
+ seamcall
+ /*
+ * SEAMCALL instruction is essentially a VMExit from VMX root
+ * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates
+ * that the targeted SEAM firmware is not loaded or disabled,
+ * or P-SEAMLDR is busy with another SEAMCALL. %rax is not
+ * changed in this case.
+ *
+ * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
+ * This value will never be used as actual SEAMCALL error code as
+ * it is from the Reserved status code class.
+ */
+ jnc .Lno_vmfailinvalid
+ mov $TDX_SEAMCALL_VMFAILINVALID, %rax
+.Lno_vmfailinvalid:
+
+ .else
+ tdcall
+ .endif
+
+ /*
+ * Fetch output pointer from stack to R12 (It is used
+ * as temporary storage)
+ */
+ pop %r12
+
+ /*
+ * Since this macro can be invoked with NULL as an output pointer,
+ * check if caller provided an output struct before storing output
+ * registers.
+ *
+ * Update output registers, even if the call failed (RAX != 0).
+ * Other registers may contain details of the failure.
+ */
+ test %r12, %r12
+ jz .Lno_output_struct
+
+ /* Copy result registers to output struct: */
+ movq %rcx, TDX_MODULE_rcx(%r12)
+ movq %rdx, TDX_MODULE_rdx(%r12)
+ movq %r8, TDX_MODULE_r8(%r12)
+ movq %r9, TDX_MODULE_r9(%r12)
+ movq %r10, TDX_MODULE_r10(%r12)
+ movq %r11, TDX_MODULE_r11(%r12)
+
+.Lno_output_struct:
+ /* Restore the state of R12 register */
+ pop %r12
+.endm
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index afc1da68b06d..85246dd9faa1 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -23,6 +23,7 @@ config XEN_PV
select PARAVIRT_XXL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
+ select GUEST_PERF_EVENTS
help
Support running as a Xen PV guest.
@@ -43,13 +44,9 @@ config XEN_PV_SMP
def_bool y
depends on XEN_PV && SMP
-config XEN_DOM0
- bool "Xen PV Dom0 support"
- default y
- depends on XEN_PV && PCI_XEN && SWIOTLB_XEN
- depends on X86_IO_APIC && ACPI && PCI
- help
- Support running as a Xen PV Dom0 guest.
+config XEN_PV_DOM0
+ def_bool y
+ depends on XEN_PV && XEN_DOM0
config XEN_PVHVM
def_bool y
@@ -86,3 +83,12 @@ config XEN_PVH
def_bool n
help
Support for running as a Xen PVH guest.
+
+config XEN_DOM0
+ bool "Xen Dom0 support"
+ default XEN_PV
+ depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64)
+ depends on X86_IO_APIC && ACPI && PCI
+ select X86_X2APIC if XEN_PVH && X86_64
+ help
+ Support running as a Xen Dom0 guest.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 40b5779fce21..3c5b52fbe4a7 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,8 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_DOM0) += vga.o
-
-obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_PV_DOM0) += vga.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 0d46cc283cf5..62d34b6611c5 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -51,7 +51,7 @@ static u32 xen_apic_read(u32 reg)
.interface_version = XENPF_INTERFACE_VERSION,
.u.pcpu_info.xen_cpuid = 0,
};
- int ret = 0;
+ int ret;
/* Shouldn't need this as APIC is turned off for PV, and we only
* get called on the bootup processor. But just in case. */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aa9f50fccc5d..30c6e986a6cd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -3,18 +3,23 @@
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
#include <linux/memblock.h>
#endif
+#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/slab.h>
+#include <linux/panic_notifier.h>
#include <xen/xen.h>
#include <xen/features.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/version.h>
#include <xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/cpu.h>
#include <asm/e820/api.h>
+#include <asm/setup.h>
#include "xen-ops.h"
#include "smp.h"
@@ -26,34 +31,16 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* Pointer to the xen_vcpu_info structure or
* &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
- * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
- * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
- * acknowledge pending events.
- * Also more subtly it is used by the patched version of irq enable/disable
- * e.g. xen_irq_enable_direct and xen_iret in PV mode.
- *
- * The desire to be able to do those mask/unmask operations as a single
- * instruction by using the per-cpu offset held in %gs is the real reason
- * vcpu info is in a per-cpu pointer and the original reason for this
- * hypercall.
- *
+ * but during boot it is switched to point to xen_vcpu_info.
+ * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-
-/*
- * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
- * hypercall. This can be used both in PV and PVHVM mode. The structure
- * overrides the default per_cpu(xen_vcpu, cpu) value.
- */
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
-enum xen_domain_type xen_domain_type = XEN_NATIVE;
-EXPORT_SYMBOL_GPL(xen_domain_type);
-
unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned long machine_to_phys_nr;
@@ -68,10 +55,12 @@ __read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);
/*
- * NB: needs to live in .data because it's used by xen_prepare_pvh which runs
- * before clearing the bss.
+ * NB: These need to live in .data or alike because they're used by
+ * xen_prepare_pvh() which runs before clearing the bss.
*/
-uint32_t xen_start_flags __section(".data") = 0;
+enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+uint32_t __ro_after_init xen_start_flags;
EXPORT_SYMBOL(xen_start_flags);
/*
@@ -80,21 +69,6 @@ EXPORT_SYMBOL(xen_start_flags);
*/
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
-/*
- * Flag to determine whether vcpu info placement is available on all
- * VCPUs. We assume it is to start with, and then set it to zero on
- * the first failure. This is because it can succeed on some VCPUs
- * and not others, since it can involve hypervisor memory allocation,
- * or because the guest failed to guarantee all the appropriate
- * constraints on all VCPUs (ie buffer can't cross a page boundary).
- *
- * Note that any particular CPU may be using a placed vcpu structure,
- * but we can only optimise if the all are.
- *
- * 0: not available, 1: available
- */
-int xen_have_vcpu_info_placement = 1;
-
static int xen_cpu_up_online(unsigned int cpu)
{
xen_init_lock_cpu(cpu);
@@ -120,10 +94,8 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
}
-static int xen_vcpu_setup_restore(int cpu)
+static void xen_vcpu_setup_restore(int cpu)
{
- int rc = 0;
-
/* Any per_cpu(xen_vcpu) is stale, so reset it */
xen_vcpu_info_reset(cpu);
@@ -132,11 +104,8 @@ static int xen_vcpu_setup_restore(int cpu)
* be handled by hotplug.
*/
if (xen_pv_domain() ||
- (xen_hvm_domain() && cpu_online(cpu))) {
- rc = xen_vcpu_setup(cpu);
- }
-
- return rc;
+ (xen_hvm_domain() && cpu_online(cpu)))
+ xen_vcpu_setup(cpu);
}
/*
@@ -146,7 +115,7 @@ static int xen_vcpu_setup_restore(int cpu)
*/
void xen_vcpu_restore(void)
{
- int cpu, rc;
+ int cpu;
for_each_possible_cpu(cpu) {
bool other_cpu = (cpu != smp_processor_id());
@@ -166,20 +135,9 @@ void xen_vcpu_restore(void)
if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_runstate_info(cpu);
- rc = xen_vcpu_setup_restore(cpu);
- if (rc)
- pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
- "System will hang.\n", cpu, rc);
- /*
- * In case xen_vcpu_setup_restore() fails, do not bring up the
- * VCPU. This helps us avoid the resulting OOPS when the VCPU
- * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
- * Note that this does not improve the situation much -- now the
- * VM hangs instead of OOPSing -- with the VCPUs that did not
- * fail, spinning in stop_machine(), waiting for the failed
- * VCPUs to come up.
- */
- if (other_cpu && is_up && (rc == 0) &&
+ xen_vcpu_setup_restore(cpu);
+
+ if (other_cpu && is_up &&
HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
BUG();
}
@@ -196,7 +154,7 @@ void xen_vcpu_info_reset(int cpu)
}
}
-int xen_vcpu_setup(int cpu)
+void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
int err;
@@ -217,44 +175,65 @@ int xen_vcpu_setup(int cpu)
*/
if (xen_hvm_domain()) {
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
- return 0;
+ return;
}
- if (xen_have_vcpu_info_placement) {
- vcpup = &per_cpu(xen_vcpu_info, cpu);
- info.mfn = arbitrary_virt_to_mfn(vcpup);
- info.offset = offset_in_page(vcpup);
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
- /*
- * Check to see if the hypervisor will put the vcpu_info
- * structure where we want it, which allows direct access via
- * a percpu-variable.
- * N.B. This hypercall can _only_ be called once per CPU.
- * Subsequent calls will error out with -EINVAL. This is due to
- * the fact that hypervisor has no unregister variant and this
- * hypercall does not allow to over-write info.mfn and
- * info.offset.
- */
- err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
- xen_vcpu_nr(cpu), &info);
-
- if (err) {
- pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
- cpu, err);
- xen_have_vcpu_info_placement = 0;
- } else {
- /*
- * This cpu is using the registered vcpu info, even if
- * later ones fail to.
- */
- per_cpu(xen_vcpu, cpu) = vcpup;
- }
- }
+ /*
+ * N.B. This hypercall can _only_ be called once per CPU.
+ * Subsequent calls will error out with -EINVAL. This is due to
+ * the fact that hypervisor has no unregister variant and this
+ * hypercall does not allow to over-write info.mfn and
+ * info.offset.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
+ &info);
+ if (err)
+ panic("register_vcpu_info failed: cpu=%d err=%d\n", cpu, err);
+
+ per_cpu(xen_vcpu, cpu) = vcpup;
+}
+
+void __init xen_banner(void)
+{
+ unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ struct xen_extraversion extra;
- if (!xen_have_vcpu_info_placement)
- xen_vcpu_info_reset(cpu);
+ HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
+ pr_info("Booting kernel on %s\n", pv_info.name);
+ pr_info("Xen version: %u.%u%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad)
+ ? " (preserve-AD)" : "");
+}
+
+/* Check if running on Xen version (major, minor) or later */
+bool xen_running_on_version_or_later(unsigned int major, unsigned int minor)
+{
+ unsigned int version;
+
+ if (!xen_domain())
+ return false;
+
+ version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
+ ((version >> 16) > major))
+ return true;
+ return false;
+}
+
+void __init xen_add_preferred_consoles(void)
+{
+ add_preferred_console("xenboot", 0, NULL);
+ if (!boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
+ if (boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
}
void xen_reboot(int reason)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index e68ea5f4ad1c..8b71b1dd7639 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -9,6 +9,7 @@
#include <xen/events.h>
#include <xen/interface/memory.h>
+#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/io_apic.h>
@@ -163,9 +164,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
- rc = xen_vcpu_setup(cpu);
- if (rc || !xen_have_vector_callback)
- return rc;
+ xen_vcpu_setup(cpu);
+ if (!xen_have_vector_callback)
+ return 0;
if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
@@ -184,8 +185,7 @@ static int xen_cpu_dead_hvm(unsigned int cpu)
if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
xen_teardown_timer(cpu);
-
- return 0;
+ return 0;
}
static bool no_vector_callback __initdata;
@@ -195,6 +195,8 @@ static void __init xen_hvm_guest_init(void)
if (xen_pv_domain())
return;
+ xen_set_restricted_virtio_memory_access();
+
init_hvm_pv_info();
reserve_shared_info();
@@ -242,15 +244,14 @@ static __init int xen_parse_no_vector_callback(char *arg)
}
early_param("xen_no_vector_callback", xen_parse_no_vector_callback);
-bool __init xen_hvm_need_lapic(void)
+static __init bool xen_x2apic_available(void)
{
- if (xen_pv_domain())
- return false;
- if (!xen_hvm_domain())
- return false;
- if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
- return false;
- return true;
+ return x2apic_supported();
+}
+
+static bool __init msi_ext_dest_id(void)
+{
+ return cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_EXT_DEST_ID;
}
static __init void xen_hvm_guest_late_init(void)
@@ -312,9 +313,10 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = {
.detect = xen_platform_hvm,
.type = X86_HYPER_XEN_HVM,
.init.init_platform = xen_hvm_guest_init,
- .init.x2apic_available = xen_x2apic_para_available,
+ .init.x2apic_available = xen_x2apic_available,
.init.init_mem_mapping = xen_hvm_init_mem_mapping,
.init.guest_late_init = xen_hvm_guest_late_init,
+ .init.msi_ext_dest_id = msi_ext_dest_id,
.runtime.pin_vcpu = xen_pin_vcpu,
.ignore_nopv = true,
};
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index dc0a337f985b..e3297b15701c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -27,12 +27,10 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
-#include <linux/highmem.h>
-#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/edd.h>
-#include <linux/objtool.h>
+#include <linux/reboot.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -109,20 +107,10 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
-static void __init xen_banner(void)
-{
- unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
- struct xen_extraversion extra;
- HYPERVISOR_xen_version(XENVER_extraversion, &extra);
-
- pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
- printk(KERN_INFO "Xen version: %d.%d%s%s\n",
- version >> 16, version & 0xffff, extra.extraversion,
- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
-}
-
static void __init xen_pv_init_platform(void)
{
+ xen_set_restricted_virtio_memory_access();
+
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
@@ -143,22 +131,6 @@ static void __init xen_pv_guest_late_init(void)
#endif
}
-/* Check if running on Xen version (major, minor) or later */
-bool
-xen_running_on_version_or_later(unsigned int major, unsigned int minor)
-{
- unsigned int version;
-
- if (!xen_domain())
- return false;
-
- version = HYPERVISOR_xen_version(XENVER_version, NULL);
- if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
- ((version >> 16) > major))
- return true;
- return false;
-}
-
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;
@@ -195,7 +167,6 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*bx &= maskebx;
}
-STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
static bool __init xen_check_mwait(void)
{
@@ -312,12 +283,12 @@ static void __init xen_init_capabilities(void)
}
}
-static void xen_set_debugreg(int reg, unsigned long val)
+static noinstr void xen_set_debugreg(int reg, unsigned long val)
{
HYPERVISOR_set_debugreg(reg, val);
}
-static unsigned long xen_get_debugreg(int reg)
+static noinstr unsigned long xen_get_debugreg(int reg)
{
return HYPERVISOR_get_debugreg(reg);
}
@@ -592,8 +563,10 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
{
/* This should never happen and there is no way to handle it. */
+ instrumentation_begin();
pr_err("Unknown trap in Xen PV mode.");
BUG();
+ instrumentation_end();
}
#ifdef CONFIG_X86_MCE
@@ -652,6 +625,9 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
+#ifdef CONFIG_X86_KERNEL_IBT
+ TRAP_ENTRY(exc_control_protection, false ),
+#endif
};
static bool __ref get_trap_addr(void **addr, unsigned int ist)
@@ -754,8 +730,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
preempt_enable();
}
-static void xen_convert_trap_info(const struct desc_ptr *desc,
- struct trap_info *traps)
+static unsigned xen_convert_trap_info(const struct desc_ptr *desc,
+ struct trap_info *traps, bool full)
{
unsigned in, out, count;
@@ -765,17 +741,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
for (in = out = 0; in < count; in++) {
gate_desc *entry = (gate_desc *)(desc->address) + in;
- if (cvt_gate_to_trap(in, entry, &traps[out]))
+ if (cvt_gate_to_trap(in, entry, &traps[out]) || full)
out++;
}
- traps[out].address = 0;
+
+ return out;
}
void xen_copy_trap_info(struct trap_info *traps)
{
const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
- xen_convert_trap_info(desc, traps);
+ xen_convert_trap_info(desc, traps, true);
}
/* Load a new IDT into Xen. In principle this can be per-CPU, so we
@@ -785,6 +762,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ unsigned out;
trace_xen_cpu_load_idt(desc);
@@ -792,7 +770,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
- xen_convert_trap_info(desc, traps);
+ out = xen_convert_trap_info(desc, traps, false);
+ memset(&traps[out], 0, sizeof(traps[0]));
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
@@ -1017,31 +996,13 @@ void __init xen_setup_vcpu_info_placement(void)
for_each_possible_cpu(cpu) {
/* Set up direct vCPU id mapping for PV guests. */
per_cpu(xen_vcpu_id, cpu) = cpu;
-
- /*
- * xen_vcpu_setup(cpu) can fail -- in which case it
- * falls back to the shared_info version for cpus
- * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
- *
- * xen_cpu_up_prepare_pv() handles the rest by failing
- * them in hotplug.
- */
- (void) xen_vcpu_setup(cpu);
+ xen_vcpu_setup(cpu);
}
- /*
- * xen_vcpu_setup managed to place the vcpu_info within the
- * percpu area for all cpus, so make use of it.
- */
- if (xen_have_vcpu_info_placement) {
- pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.irq_disable =
- __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
- pv_ops.irq.irq_enable =
- __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
- pv_ops.mmu.read_cr2 =
- __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
- }
+ pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
+ pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
}
static const struct pv_info xen_info __initconst = {
@@ -1049,54 +1010,54 @@ static const struct pv_info xen_info __initconst = {
.name = "Xen",
};
-static const struct pv_cpu_ops xen_cpu_ops __initconst = {
- .cpuid = xen_cpuid,
-
- .set_debugreg = xen_set_debugreg,
- .get_debugreg = xen_get_debugreg,
+static const typeof(pv_ops) xen_cpu_ops __initconst = {
+ .cpu = {
+ .cpuid = xen_cpuid,
- .read_cr0 = xen_read_cr0,
- .write_cr0 = xen_write_cr0,
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
- .write_cr4 = xen_write_cr4,
+ .read_cr0 = xen_read_cr0,
+ .write_cr0 = xen_write_cr0,
- .wbinvd = native_wbinvd,
+ .write_cr4 = xen_write_cr4,
- .read_msr = xen_read_msr,
- .write_msr = xen_write_msr,
+ .wbinvd = native_wbinvd,
- .read_msr_safe = xen_read_msr_safe,
- .write_msr_safe = xen_write_msr_safe,
+ .read_msr = xen_read_msr,
+ .write_msr = xen_write_msr,
- .read_pmc = xen_read_pmc,
+ .read_msr_safe = xen_read_msr_safe,
+ .write_msr_safe = xen_write_msr_safe,
- .iret = xen_iret,
+ .read_pmc = xen_read_pmc,
- .load_tr_desc = paravirt_nop,
- .set_ldt = xen_set_ldt,
- .load_gdt = xen_load_gdt,
- .load_idt = xen_load_idt,
- .load_tls = xen_load_tls,
- .load_gs_index = xen_load_gs_index,
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+ .load_gs_index = xen_load_gs_index,
- .alloc_ldt = xen_alloc_ldt,
- .free_ldt = xen_free_ldt,
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
- .store_tr = xen_store_tr,
+ .store_tr = xen_store_tr,
- .write_ldt_entry = xen_write_ldt_entry,
- .write_gdt_entry = xen_write_gdt_entry,
- .write_idt_entry = xen_write_idt_entry,
- .load_sp0 = xen_load_sp0,
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_sp0 = xen_load_sp0,
#ifdef CONFIG_X86_IOPL_IOPERM
- .invalidate_io_bitmap = xen_invalidate_io_bitmap,
- .update_io_bitmap = xen_update_io_bitmap,
+ .invalidate_io_bitmap = xen_invalidate_io_bitmap,
+ .update_io_bitmap = xen_update_io_bitmap,
#endif
- .io_delay = xen_io_delay,
+ .io_delay = xen_io_delay,
- .start_context_switch = paravirt_start_context_switch,
- .end_context_switch = xen_end_context_switch,
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
+ },
};
static void xen_restart(char *msg)
@@ -1111,8 +1072,7 @@ static void xen_machine_halt(void)
static void xen_machine_power_off(void)
{
- if (pm_power_off)
- pm_power_off();
+ do_kernel_power_off();
xen_reboot(SHUTDOWN_poweroff);
}
@@ -1204,7 +1164,6 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- setup_stack_canary_segment(cpu);
switch_to_new_gdt(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
@@ -1216,6 +1175,13 @@ static void __init xen_dom0_set_legacy_features(void)
x86_platform.legacy.rtc = 1;
}
+static void __init xen_domu_set_legacy_features(void)
+{
+ x86_platform.legacy.rtc = 0;
+}
+
+extern void early_xen_iret_patch(void);
+
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
@@ -1226,6 +1192,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (!xen_start_info)
return;
+ __text_gen_insn(&early_xen_iret_patch,
+ JMP32_INSN_OPCODE, &early_xen_iret_patch, &xen_iret,
+ JMP32_INSN_SIZE);
+
xen_domain_type = XEN_PV_DOMAIN;
xen_start_flags = xen_start_info->flags;
@@ -1233,8 +1203,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_ops.init.patch = paravirt_patch_default;
- pv_ops.cpu = xen_cpu_ops;
+ pv_ops.cpu = xen_cpu_ops.cpu;
xen_init_irq_ops();
/*
@@ -1267,25 +1236,19 @@ asmlinkage __visible void __init xen_start_kernel(void)
__supported_pte_mask &= ~_PAGE_GLOBAL;
__default_kernel_pte_mask &= ~_PAGE_GLOBAL;
- /*
- * Prevent page tables from being allocated in highmem, even
- * if CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
/* Get mfn list */
xen_build_dynamic_phys_to_machine();
+ /* Work out if we support NX */
+ get_cpu_cap(&boot_cpu_data);
+ x86_configure_nx();
+
/*
* Set up kernel GDT and segment registers, mainly so that
* -fstack-protector code can be executed.
*/
xen_setup_gdt(0);
- /* Work out if we support NX */
- get_cpu_cap(&boot_cpu_data);
- x86_configure_nx();
-
/* Determine virtual and physical address sizes */
get_cpu_address_sizes(&boot_cpu_data);
@@ -1303,13 +1266,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_init_apic();
#endif
- if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
- pv_ops.mmu.ptep_modify_prot_start =
- xen_ptep_modify_prot_start;
- pv_ops.mmu.ptep_modify_prot_commit =
- xen_ptep_modify_prot_commit;
- }
-
machine_ops = xen_machine_ops;
/*
@@ -1365,9 +1321,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
if (!xen_initial_domain()) {
- add_preferred_console("xenboot", 0, NULL);
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
+ x86_platform.set_legacy_features =
+ xen_domu_set_legacy_features;
} else {
const struct dom0_vga_console_info *info =
(void *)((char *)xen_start_info +
@@ -1392,10 +1349,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_acpi_sleep_register();
- /* Avoid searching for BIOS MP tables */
- x86_init.mpparse.find_smp_config = x86_init_noop;
- x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-
xen_boot_params_init_edd();
#ifdef CONFIG_ACPI
@@ -1408,11 +1361,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
}
- if (!boot_params.screen_info.orig_video_isVGA)
- add_preferred_console("tty", 0, NULL);
- add_preferred_console("hvc", 0, NULL);
- if (boot_params.screen_info.orig_video_isVGA)
- add_preferred_console("tty", 0, NULL);
+ xen_add_preferred_consoles();
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 0d5e34b9e6f9..bcae606bbc5c 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
+#include <linux/export.h>
#include <xen/hvc-console.h>
@@ -18,10 +19,11 @@
/*
* PVH variables.
*
- * The variable xen_pvh needs to live in the data segment since it is used
+ * The variable xen_pvh needs to live in a data segment since it is used
* after startup_{32|64} is invoked, which will clear the .bss segment.
*/
-bool xen_pvh __section(".data") = 0;
+bool __ro_after_init xen_pvh;
+EXPORT_SYMBOL_GPL(xen_pvh);
void __init xen_pvh_init(struct boot_params *boot_params)
{
@@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params)
pfn = __pa(hypercall_page);
wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ if (xen_initial_domain())
+ x86_init.oem.arch_setup = xen_add_preferred_consoles;
+ x86_init.oem.banner = xen_banner;
+
xen_efi_init(boot_params);
}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index dfa091d79c2e..06c3c2fb4b06 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,65 +19,11 @@
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
-void xen_force_evtchn_callback(void)
+noinstr void xen_force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
-asmlinkage __visible unsigned long xen_save_fl(void)
-{
- struct vcpu_info *vcpu;
- unsigned long flags;
-
- vcpu = this_cpu_read(xen_vcpu);
-
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
-
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-
-asmlinkage __visible void xen_irq_disable(void)
-{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
-
-asmlinkage __visible void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
-
- /*
- * We may be preempted as soon as vcpu->evtchn_upcall_mask is
- * cleared, so disable preemption to ensure we check for
- * events on the VCPU we are still running on.
- */
- preempt_disable();
-
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
-
- preempt_enable();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
-
static void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
@@ -94,17 +40,20 @@ static void xen_halt(void)
xen_safe_halt();
}
-static const struct pv_irq_ops xen_irq_ops __initconst = {
- .save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
- .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+static const typeof(pv_ops) xen_irq_ops __initconst = {
+ .irq = {
+ /* Initial interrupt flag handling only called while interrupts off. */
+ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
+ .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
+ .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+ },
};
void __init xen_init_irq_ops(void)
{
- pv_ops.irq = xen_irq_ops;
+ pv_ops.irq = xen_irq_ops.irq;
x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 57409373750f..509bdee3ab90 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -9,39 +9,28 @@
#ifdef CONFIG_PROC_VMCORE
/*
- * This function is used in two contexts:
- * - the kdump kernel has to check whether a pfn of the crashed kernel
- * was a ballooned page. vmcore is using this function to decide
- * whether to access a pfn of the crashed kernel.
- * - the kexec kernel has to check whether a pfn was ballooned by the
- * previous kernel. If the pfn is ballooned, handle it properly.
- * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * The kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
+ * Returns "false" if the pfn is not backed by a RAM page, the caller may
* handle the pfn special in this case.
*/
-static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
{
struct xen_hvm_get_mem_type a = {
.domid = DOMID_SELF,
.pfn = pfn,
};
- int ram;
- if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
- return -ENXIO;
-
- switch (a.mem_type) {
- case HVMMEM_mmio_dm:
- ram = 0;
- break;
- case HVMMEM_ram_rw:
- case HVMMEM_ram_ro:
- default:
- ram = 1;
- break;
+ if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) {
+ pr_warn_once("Unexpected HVMOP_get_mem_type failure\n");
+ return true;
}
-
- return ram;
+ return a.mem_type != HVMMEM_mmio_dm;
}
+static struct vmcore_cb xen_vmcore_cb = {
+ .pfn_is_ram = xen_vmcore_pfn_is_ram,
+};
#endif
static void xen_hvm_exit_mmap(struct mm_struct *mm)
@@ -75,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void)
if (is_pagetable_dying_supported())
pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap;
#ifdef CONFIG_PROC_VMCORE
- WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
+ register_vmcore_cb(&xen_vmcore_cb);
#endif
}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cf2ade864c30..ee29fb558f2e 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -41,7 +41,6 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/sched/mm.h>
-#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
@@ -81,13 +80,16 @@
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>
+#include <xen/swiotlb-xen.h>
#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif
/*
* Protects atomic reservation decrease/increase against concurrent increases.
@@ -241,9 +243,11 @@ static void xen_set_pmd(pmd_t *ptr, pmd_t val)
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
-void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
- set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
+ if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags),
+ UVMF_INVLPG))
+ BUG();
}
static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
@@ -789,7 +793,9 @@ static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
static void __init xen_after_bootmem(void)
{
static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -1025,7 +1031,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
make_lowmem_page_readwrite(vaddr);
- memblock_free(paddr, size);
+ memblock_phys_free(paddr, size);
}
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
@@ -1151,7 +1157,7 @@ static void __init xen_pagetable_p2m_free(void)
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
- memblock_free(__pa(addr), size);
+ memblock_free((void *)addr, size);
} else {
xen_cleanmfnmap(addr);
}
@@ -1192,6 +1198,13 @@ static void __init xen_pagetable_p2m_setup(void)
static void __init xen_pagetable_init(void)
{
+ /*
+ * The majority of further PTE writes is to pagetables already
+ * announced as such to Xen. Hence it is more efficient to use
+ * hypercalls for these updates.
+ */
+ pv_ops.mmu.set_pte = __xen_set_pte;
+
paging_init();
xen_post_allocator_init();
@@ -1204,7 +1217,8 @@ static void __init xen_pagetable_init(void)
xen_remap_memory();
xen_setup_mfn_list_list();
}
-static void xen_write_cr2(unsigned long cr2)
+
+static noinstr void xen_write_cr2(unsigned long cr2)
{
this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}
@@ -1247,8 +1261,8 @@ static void xen_flush_tlb_one_user(unsigned long addr)
preempt_enable();
}
-static void xen_flush_tlb_others(const struct cpumask *cpus,
- const struct flush_tlb_info *info)
+static void xen_flush_tlb_multi(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
@@ -1258,7 +1272,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
const size_t mc_entry_size = sizeof(args->op) +
sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
- trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
+ trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1267,9 +1281,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
args = mcs.args;
args->op.arg2.vcpumask = to_cpumask(args->mask);
- /* Remove us, and any offline CPUS. */
+ /* Remove any offline CPUs */
cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
if (info->end != TLB_FLUSH_ALL &&
@@ -1421,10 +1434,18 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
*
* Many of these PTE updates are done on unpinned and writable pages
* and doing a hypercall for these is unnecessary and expensive. At
- * this point it is not possible to tell if a page is pinned or not,
- * so always write the PTE directly and rely on Xen trapping and
+ * this point it is rarely possible to tell if a page is pinned, so
+ * mostly write the PTE directly and rely on Xen trapping and
* emulating any updates as necessary.
*/
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ if (unlikely(is_early_ioremap_ptep(ptep)))
+ __xen_set_pte(ptep, pte);
+ else
+ native_set_pte(ptep, pte);
+}
+
__visible pte_t xen_make_pte_init(pteval_t pte)
{
unsigned long pfn;
@@ -1446,11 +1467,6 @@ __visible pte_t xen_make_pte_init(pteval_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
-static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
- __xen_set_pte(ptep, pte);
-}
-
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1519,14 +1535,17 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
if (pinned) {
struct page *page = pfn_to_page(pfn);
- if (static_branch_likely(&xen_struct_pages_ready))
+ pinned = false;
+ if (static_branch_likely(&xen_struct_pages_ready)) {
+ pinned = PagePinned(page);
SetPagePinned(page);
+ }
xen_mc_batch();
__set_pfn_prot(pfn, PAGE_KERNEL_RO);
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -1747,7 +1766,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
@@ -1764,6 +1782,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ /* Pin user vsyscall L3 */
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
+#endif
+
/*
* At this stage there can be no user pgd, and no page structure to
* attach it to, so make sure we just set kernel pgd.
@@ -1953,7 +1978,7 @@ void __init xen_relocate_p2m(void)
pfn_end = p2m_pfn_end;
}
- memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
while (pfn < pfn_end) {
if (pfn == p2m_pfn) {
pfn = p2m_pfn_end;
@@ -1996,6 +2021,7 @@ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
+ unsigned long vaddr;
phys >>= PAGE_SHIFT;
@@ -2036,15 +2062,15 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
break;
}
- __native_set_fixmap(idx, pte);
+ vaddr = __fix_to_virt(idx);
+ if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG))
+ BUG();
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
- if (idx == VSYSCALL_PAGE) {
- unsigned long vaddr = __fix_to_virt(idx);
+ if (idx == VSYSCALL_PAGE)
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
- }
#endif
}
@@ -2076,67 +2102,69 @@ static void xen_leave_lazy_mmu(void)
preempt_enable();
}
-static const struct pv_mmu_ops xen_mmu_ops __initconst = {
- .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
- .write_cr2 = xen_write_cr2,
+static const typeof(pv_ops) xen_mmu_ops __initconst = {
+ .mmu = {
+ .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
+ .write_cr2 = xen_write_cr2,
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3_init,
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3_init,
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_one_user = xen_flush_tlb_one_user,
- .flush_tlb_others = xen_flush_tlb_others,
- .tlb_remove_table = tlb_remove_table,
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_one_user = xen_flush_tlb_one_user,
+ .flush_tlb_multi = xen_flush_tlb_multi,
+ .tlb_remove_table = tlb_remove_table,
- .pgd_alloc = xen_pgd_alloc,
- .pgd_free = xen_pgd_free,
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
- .alloc_pte = xen_alloc_pte_init,
- .release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pmd_init,
- .release_pmd = xen_release_pmd_init,
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
+ .release_pmd = xen_release_pmd_init,
- .set_pte = xen_set_pte_init,
- .set_pmd = xen_set_pmd_hyper,
+ .set_pte = xen_set_pte_init,
+ .set_pmd = xen_set_pmd_hyper,
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+ .ptep_modify_prot_start = xen_ptep_modify_prot_start,
+ .ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
- .pte_val = PV_CALLEE_SAVE(xen_pte_val),
- .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
- .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
- .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
- .set_pud = xen_set_pud_hyper,
+ .set_pud = xen_set_pud_hyper,
- .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
- .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
- .pud_val = PV_CALLEE_SAVE(xen_pud_val),
- .make_pud = PV_CALLEE_SAVE(xen_make_pud),
- .set_p4d = xen_set_p4d_hyper,
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_p4d = xen_set_p4d_hyper,
- .alloc_pud = xen_alloc_pmd_init,
- .release_pud = xen_release_pmd_init,
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
#if CONFIG_PGTABLE_LEVELS >= 5
- .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
- .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
+ .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
+ .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy_mmu,
- .flush = paravirt_flush_lazy_mmu,
- },
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
+ },
- .set_fixmap = xen_set_fixmap,
+ .set_fixmap = xen_set_fixmap,
+ },
};
void __init xen_init_mmu_ops(void)
@@ -2144,7 +2172,7 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_init = xen_pagetable_init;
x86_init.hyper.init_after_bootmem = xen_after_bootmem;
- pv_ops.mmu = xen_mmu_ops;
+ pv_ops.mmu = xen_mmu_ops.mmu;
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
@@ -2396,7 +2424,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
- unsigned int domid, bool no_translate, struct page **pages)
+ unsigned int domid, bool no_translate)
{
int err = 0;
struct remap_data rmd;
@@ -2410,7 +2438,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
rmd.prot = prot;
/*
* We use the err_ptr to indicate if there we are doing a contiguous
- * mapping or a discontigious mapping.
+ * mapping or a discontiguous mapping.
*/
rmd.contiguous = !err_ptr;
rmd.no_translate = no_translate;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index ac06ca32e9ef..58db86f7b384 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void)
static void __ref free_p2m_page(void *p)
{
if (unlikely(!slab_is_available())) {
- memblock_free((unsigned long)p, PAGE_SIZE);
+ memblock_free(p, PAGE_SIZE);
return;
}
@@ -618,8 +618,8 @@ int xen_alloc_p2m_entry(unsigned long pfn)
}
/* Expanded the p2m? */
- if (pfn > xen_p2m_last_pfn) {
- xen_p2m_last_pfn = pfn;
+ if (pfn >= xen_p2m_last_pfn) {
+ xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
}
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
deleted file mode 100644
index 19ae3e4fe4e9..000000000000
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* Glue code to lib/swiotlb-xen.c */
-
-#include <linux/dma-map-ops.h>
-#include <linux/pci.h>
-#include <xen/swiotlb-xen.h>
-
-#include <asm/xen/hypervisor.h>
-#include <xen/xen.h>
-#include <asm/iommu_table.h>
-
-
-#include <asm/xen/swiotlb-xen.h>
-#ifdef CONFIG_X86_64
-#include <asm/iommu.h>
-#include <asm/dma.h>
-#endif
-#include <linux/export.h>
-
-int xen_swiotlb __read_mostly;
-
-/*
- * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
- *
- * This returns non-zero if we are forced to use xen_swiotlb (by the boot
- * option).
- */
-int __init pci_xen_swiotlb_detect(void)
-{
-
- if (!xen_pv_domain())
- return 0;
-
- /* If running as PV guest, either iommu=soft, or swiotlb=force will
- * activate this IOMMU. If running as PV privileged, activate it
- * irregardless.
- */
- if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE)
- xen_swiotlb = 1;
-
- /* If we are running under Xen, we MUST disable the native SWIOTLB.
- * Don't worry about swiotlb_force flag activating the native, as
- * the 'swiotlb' flag is the only one turning it on. */
- swiotlb = 0;
-
-#ifdef CONFIG_X86_64
- /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
- * (so no iommu=X command line over-writes).
- * Considering that PV guests do not want the *native SWIOTLB* but
- * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
- */
- if (max_pfn > MAX_DMA32_PFN)
- no_iommu = 1;
-#endif
- return xen_swiotlb;
-}
-
-void __init pci_xen_swiotlb_init(void)
-{
- if (xen_swiotlb) {
- xen_swiotlb_init(1, true /* early */);
- dma_ops = &xen_swiotlb_dma_ops;
-
-#ifdef CONFIG_PCI
- /* Make sure ACS will be enabled */
- pci_request_acs();
-#endif
- }
-}
-
-int pci_xen_swiotlb_init_late(void)
-{
- int rc;
-
- if (xen_swiotlb)
- return 0;
-
- rc = xen_swiotlb_init(1, false /* late */);
- if (rc)
- return rc;
-
- dma_ops = &xen_swiotlb_dma_ops;
-#ifdef CONFIG_PCI
- /* Make sure ACS will be enabled */
- pci_request_acs();
-#endif
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
-
-IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
- NULL,
- pci_xen_swiotlb_init,
- NULL);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 96d7f7d39cb9..62ac4898df1a 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -7,6 +7,8 @@
* Copyright (c) 2010, Citrix
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/io.h>
#include <linux/export.h>
@@ -30,13 +32,13 @@ static int check_platform_magic(void)
magic = inw(XEN_IOPORT_MAGIC);
if (magic != XEN_IOPORT_MAGIC_VAL) {
- printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ pr_err("Xen Platform PCI: unrecognised magic value\n");
return XEN_PLATFORM_ERR_MAGIC;
}
protocol = inb(XEN_IOPORT_PROTOVER);
- printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ pr_debug("Xen Platform PCI: I/O protocol version %d\n",
protocol);
switch (protocol) {
@@ -44,12 +46,12 @@ static int check_platform_magic(void)
outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
- printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ pr_err("Xen Platform: blacklisted by host\n");
return XEN_PLATFORM_ERR_BLACKLIST;
}
break;
default:
- printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
+ pr_warn("Xen Platform PCI: unknown I/O protocol version\n");
return XEN_PLATFORM_ERR_PROTOCOL;
}
@@ -155,12 +157,12 @@ void xen_unplug_emulated_devices(void)
* been compiled for this kernel (modules or built-in are both OK). */
if (!xen_emul_unplug) {
if (xen_must_unplug_nics()) {
- printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ pr_info("Netfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated NICs.\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
}
if (xen_must_unplug_disks()) {
- printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ pr_info("Blkfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated disks.\n"
"You might have to change the root device\n"
"from /dev/hd[a-d] to /dev/xvd[a-d]\n"
@@ -200,7 +202,7 @@ static int __init parse_xen_emul_unplug(char *arg)
else if (!strncmp(p, "never", l))
xen_emul_unplug |= XEN_UNPLUG_NEVER;
else
- printk(KERN_WARNING "unrecognised option '%s' "
+ pr_warn("unrecognised option '%s' "
"in parameter 'xen_emul_unplug'\n", p);
}
return 0;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index e13b0b49fcdf..21ecbe754cb2 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val)
}
/* perf callbacks */
-static int xen_is_in_guest(void)
+static unsigned int xen_guest_state(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ unsigned int state = 0;
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ return state;
}
if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
- return 0;
+ return state;
- return 1;
-}
+ state |= PERF_GUEST_ACTIVE;
-static int xen_is_user_mode(void)
-{
- const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
-
- if (!xenpmu_data) {
- pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) {
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER)
+ state |= PERF_GUEST_USER;
+ } else if (xenpmu_data->pmu.r.regs.cpl & 3) {
+ state |= PERF_GUEST_USER;
}
- if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
- return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
- else
- return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+ return state;
}
static unsigned long xen_get_guest_ip(void)
@@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void)
}
static struct perf_guest_info_callbacks xen_guest_cbs = {
- .is_in_guest = xen_is_in_guest,
- .is_user_mode = xen_is_user_mode,
- .get_guest_ip = xen_get_guest_ip,
+ .state = xen_guest_state,
+ .get_ip = xen_get_guest_ip,
};
/* Convert registers from Xen's format to Linux' */
@@ -512,10 +506,7 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
return ret;
}
-bool is_xen_pmu(int cpu)
-{
- return (get_xenpmu_data() != NULL);
-}
+bool is_xen_pmu;
void xen_pmu_init(int cpu)
{
@@ -526,7 +517,7 @@ void xen_pmu_init(int cpu)
BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
- if (xen_hvm_domain())
+ if (xen_hvm_domain() || (cpu != 0 && !is_xen_pmu))
return;
xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
@@ -547,7 +538,8 @@ void xen_pmu_init(int cpu)
per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
per_cpu(xenpmu_shared, cpu).flags = 0;
- if (cpu == 0) {
+ if (!is_xen_pmu) {
+ is_xen_pmu = true;
perf_register_guest_info_callbacks(&xen_guest_cbs);
xen_pmu_arch_init();
}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
index 0e83a160589b..65c58894fc79 100644
--- a/arch/x86/xen/pmu.h
+++ b/arch/x86/xen/pmu.h
@@ -4,6 +4,8 @@
#include <xen/interface/xenpmu.h>
+extern bool is_xen_pmu;
+
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
#ifdef CONFIG_XEN_HAVE_VPMU
void xen_pmu_init(int cpu);
@@ -12,7 +14,6 @@ void xen_pmu_finish(int cpu);
static inline void xen_pmu_init(int cpu) {}
static inline void xen_pmu_finish(int cpu) {}
#endif
-bool is_xen_pmu(int cpu);
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
int pmu_apic_update(uint32_t reg);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8bfc10330107..81aa46f770c5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn,
break;
}
}
- memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+ memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
/*
@@ -306,10 +306,6 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
BUG();
}
- /* Update kernel mapping, but not for highmem. */
- if (pfn >= PFN_UP(__pa(high_memory - 1)))
- return;
-
if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
mfn_pte(mfn, PAGE_KERNEL), 0)) {
WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
@@ -429,13 +425,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
}
/*
- * If the PFNs are currently mapped, the VA mapping also needs
- * to be updated to be 1:1.
+ * If the PFNs are currently mapped, their VA mappings need to be
+ * zapped.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
(void)HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+ native_make_pte(0), 0);
return remap_pfn;
}
@@ -719,11 +715,11 @@ static void __init xen_reserve_xen_mfnlist(void)
return;
xen_relocate_p2m();
- memblock_free(start, size);
+ memblock_phys_free(start, size);
}
/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
+ * xen_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
@@ -885,7 +881,7 @@ char * __init xen_memory_setup(void)
xen_phys_memcpy(new_area, start, size);
pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
start, start + size, new_area, new_area + size);
- memblock_free(start, size);
+ memblock_phys_free(start, size);
boot_params.hdr.ramdisk_image = new_area;
boot_params.ext_ramdisk_image = new_area >> 32;
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c1b2f764b29a..c3e1f9a7d43a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -121,34 +121,10 @@ int xen_smp_intr_init(unsigned int cpu)
void __init xen_smp_cpus_done(unsigned int max_cpus)
{
- int cpu, rc, count = 0;
-
if (xen_hvm_domain())
native_smp_cpus_done(max_cpus);
else
calculate_max_logical_packages();
-
- if (xen_have_vcpu_info_placement)
- return;
-
- for_each_online_cpu(cpu) {
- if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
- continue;
-
- rc = remove_cpu(cpu);
-
- if (rc == 0) {
- /*
- * Reset vcpu_info so this cpu cannot be onlined again.
- */
- xen_vcpu_info_reset(cpu);
- count++;
- } else {
- pr_warn("%s: failed to bring CPU %d down, error %d\n",
- __func__, cpu, rc);
- }
- }
- WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
}
void xen_smp_send_reschedule(int cpu)
@@ -268,20 +244,16 @@ void xen_send_IPI_allbutself(int vector)
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
- irq_enter();
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
return IRQ_HANDLED;
}
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
- irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
return IRQ_HANDLED;
}
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index 6ff3c887e0b9..b70afdff419c 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -20,6 +20,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
xen_vcpu_setup(0);
/*
+ * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS.
+ * Refer to comments in xen_hvm_init_time_ops().
+ */
+ xen_hvm_init_time_ops();
+
+ /*
* The alternative logic (which patches the unlock/lock) runs before
* the smp bootup up code is activated. Hence we need to set this up
* the core kernel is being patched. Otherwise we will have only
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index c2ac319f11a4..ba7af2eca755 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -64,7 +64,6 @@ static void cpu_bringup(void)
cr4_init();
cpu_init();
touch_softlockup_watchdog();
- preempt_disable();
/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
@@ -130,7 +129,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name;
- if (is_xen_pmu(cpu)) {
+ if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
@@ -149,28 +148,12 @@ int xen_smp_intr_init_pv(unsigned int cpu)
return rc;
}
-static void __init xen_fill_possible_map(void)
-{
- int i, rc;
-
- if (xen_initial_domain())
- return;
-
- for (i = 0; i < nr_cpu_ids; i++) {
- rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0) {
- num_processors++;
- set_cpu_possible(i, true);
- }
- }
-}
-
-static void __init xen_filter_cpu_maps(void)
+static void __init _get_smp_config(unsigned int early)
{
int i, rc;
unsigned int subtract = 0;
- if (!xen_initial_domain())
+ if (early)
return;
num_processors = 0;
@@ -211,7 +194,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
* sure the old memory can be recycled. */
make_lowmem_page_readwrite(xen_initial_gdt);
- xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
/*
@@ -226,7 +208,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- unsigned int i;
if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ?
@@ -239,16 +220,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
}
xen_init_lock_cpu(0);
- smp_store_boot_cpu_info();
- cpu_data(0).x86_max_cores = 1;
+ smp_prepare_cpus_common();
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- }
- set_cpu_sibling_map(0);
+ cpu_data(0).x86_max_cores = 1;
speculative_store_bypass_ht_init();
@@ -286,13 +260,14 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
- if (ctxt == NULL)
+ if (ctxt == NULL) {
+ cpumask_clear_cpu(cpu, xen_cpu_initialized_map);
+ cpumask_clear_cpu(cpu, cpu_callout_mask);
return -ENOMEM;
+ }
gdt = get_cpu_gdt_rw(cpu);
- memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
-
/*
* Bring up the CPU in cpu_bringup_and_idle() with the stack
* pointing just below where pt_regs would be if it were a normal
@@ -309,8 +284,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
xen_copy_trap_info(ctxt->trap_ctxt);
- ctxt->ldt_ents = 0;
-
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
gdt_mfn = arbitrary_virt_to_mfn(gdt);
@@ -463,10 +436,8 @@ static void xen_pv_stop_other_cpus(int wait)
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
- irq_enter();
irq_work_run();
inc_irq_stat(apic_irq_work_irqs);
- irq_exit();
return IRQ_HANDLED;
}
@@ -491,5 +462,8 @@ static const struct smp_ops xen_smp_ops __initconst = {
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
- xen_fill_possible_map();
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = _get_smp_config;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 91f5b330dcc6..9ef0a5cca96e 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -379,11 +379,6 @@ void xen_timer_resume(void)
}
}
-static const struct pv_time_ops xen_time_ops __initconst = {
- .sched_clock = xen_sched_clock,
- .steal_clock = xen_steal_clock,
-};
-
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;
@@ -525,17 +520,24 @@ static void __init xen_time_init(void)
pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
-void __init xen_init_time_ops(void)
+static void __init xen_init_time_common(void)
{
xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ static_call_update(pv_steal_clock, xen_steal_clock);
+ paravirt_set_sched_clock(xen_sched_clock);
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+}
+
+void __init xen_init_time_ops(void)
+{
+ xen_init_time_common();
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_cpuinit.setup_percpu_clockev = x86_init_noop;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
/* Dom0 uses the native method to set the hardware RTC. */
if (!xen_initial_domain())
x86_platform.set_wallclock = xen_set_wallclock;
@@ -556,6 +558,11 @@ static void xen_hvm_setup_cpu_clockevents(void)
void __init xen_hvm_init_time_ops(void)
{
+ static bool hvm_time_initialized;
+
+ if (hvm_time_initialized)
+ return;
+
/*
* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
@@ -565,18 +572,33 @@ void __init xen_hvm_init_time_ops(void)
return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
- pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
+ pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
return;
}
- xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ /*
+ * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
+ * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
+ * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
+ * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
+ *
+ * The xen_hvm_init_time_ops() should be called again later after
+ * __this_cpu_read(xen_vcpu) is available.
+ */
+ if (!__this_cpu_read(xen_vcpu)) {
+ pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
+ xen_vcpu_nr(0));
+ return;
+ }
+
+ xen_init_time_common();
+
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
+
+ hvm_time_initialized = true;
}
#endif
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index e336f223f7f4..14ea32e734d5 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -57,16 +57,20 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.ext_lfb_base)
+ + sizeof(info->u.vesa_lfb.ext_lfb_base)
+ && info->u.vesa_lfb.ext_lfb_base) {
+ screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base;
+ screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ }
+
if (info->video_type == XEN_VGATYPE_EFI_LFB) {
screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
break;
}
if (size >= offsetof(struct dom0_vga_console_info,
- u.vesa_lfb.gbl_caps)
- + sizeof(info->u.vesa_lfb.gbl_caps))
- screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
- if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.mode_attrs)
+ sizeof(info->u.vesa_lfb.mode_attrs))
screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 1e626444712b..caa9bc2fa100 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,46 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
+
+.pushsection .noinstr.text, "ax"
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+SYM_FUNC_START(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ RET
+SYM_FUNC_END(xen_irq_disable_direct)
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+SYM_FUNC_START(check_events)
+ FRAME_BEGIN
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+ FRAME_END
+ RET
+SYM_FUNC_END(check_events)
/*
* Enable events. This clears the event mask and tests the pending
@@ -44,19 +84,9 @@ SYM_FUNC_START(xen_irq_enable_direct)
call check_events
1:
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_irq_enable_direct)
-
-/*
- * Disabling events is simply a matter of making the event mask
- * non-zero.
- */
-SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- ret
-SYM_FUNC_END(xen_irq_disable_direct)
-
/*
* (xen_)save_fl is used to get the current interrupt enable status.
* Callers expect the status to be in X86_EFLAGS_IF, and other bits
@@ -70,56 +100,29 @@ SYM_FUNC_START(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
- ret
+ RET
SYM_FUNC_END(xen_save_fl_direct)
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-SYM_FUNC_START(check_events)
- FRAME_BEGIN
- push %rax
- push %rcx
- push %rdx
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- call xen_force_evtchn_callback
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rdx
- pop %rcx
- pop %rax
- FRAME_END
- ret
-SYM_FUNC_END(check_events)
-
SYM_FUNC_START(xen_read_cr2)
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2_direct);
+.popsection
.macro xen_pv_trap name
SYM_CODE_START(xen_\name)
UNWIND_HINT_EMPTY
+ ENDBR
pop %rcx
pop %r11
jmp \name
@@ -145,6 +148,9 @@ xen_pv_trap asm_exc_page_fault
xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
+#ifdef CONFIG_X86_KERNEL_IBT
+xen_pv_trap asm_exc_control_protection
+#endif
#ifdef CONFIG_X86_MCE
xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
@@ -160,6 +166,7 @@ SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
UNWIND_HINT_EMPTY
+ ENDBR
pop %rcx
pop %r11
jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
@@ -187,11 +194,31 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
*/
SYM_CODE_START(xen_iret)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
pushq $0
jmp hypercall_iret
SYM_CODE_END(xen_iret)
/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
@@ -209,6 +236,7 @@ SYM_CODE_END(xen_iret)
/* Normal 64-bit system call target */
SYM_CODE_START(xen_syscall_target)
UNWIND_HINT_EMPTY
+ ENDBR
popq %rcx
popq %r11
@@ -228,6 +256,7 @@ SYM_CODE_END(xen_syscall_target)
/* 32-bit compat syscall target */
SYM_CODE_START(xen_syscall32_target)
UNWIND_HINT_EMPTY
+ ENDBR
popq %rcx
popq %r11
@@ -245,6 +274,7 @@ SYM_CODE_END(xen_syscall32_target)
/* 32-bit compat sysenter target */
SYM_CODE_START(xen_sysenter_target)
UNWIND_HINT_EMPTY
+ ENDBR
/*
* NB: Xen is polite and clears TF from EFLAGS for us. This means
* that we don't need to guard against single step exceptions here.
@@ -268,6 +298,7 @@ SYM_CODE_END(xen_sysenter_target)
SYM_CODE_START(xen_syscall32_target)
SYM_CODE_START(xen_sysenter_target)
UNWIND_HINT_EMPTY
+ ENDBR
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $0
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index cb6538ae2fe0..3a2cd93bf059 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -20,21 +20,43 @@
#include <xen/interface/xen-mca.h>
#include <asm/xen/interface.h>
+.pushsection .noinstr.text, "ax"
+ .balign PAGE_SIZE
+SYM_CODE_START(hypercall_page)
+ .rept (PAGE_SIZE / 32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ret
+ /*
+ * Xen will write the hypercall page, and sort out ENDBR.
+ */
+ .skip 31, 0xcc
+ .endr
+
+#define HYPERCALL(n) \
+ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+SYM_CODE_END(hypercall_page)
+.popsection
+
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
cld
/* Clear .bss */
xor %eax,%eax
- mov $__bss_start, %_ASM_DI
- mov $__bss_stop, %_ASM_CX
- sub %_ASM_DI, %_ASM_CX
- shr $__ASM_SEL(2, 3), %_ASM_CX
- rep __ASM_SIZE(stos)
+ mov $__bss_start, %rdi
+ mov $__bss_stop, %rcx
+ sub %rdi, %rcx
+ shr $3, %rcx
+ rep stosq
- mov %_ASM_SI, xen_start_info
+ mov %rsi, xen_start_info
mov initial_stack(%rip), %rsp
/* Set up %gs.
@@ -57,6 +79,7 @@ SYM_CODE_END(startup_xen)
.pushsection .text
SYM_CODE_START(asm_cpu_bringup_and_idle)
UNWIND_HINT_EMPTY
+ ENDBR
call cpu_bringup_and_idle
SYM_CODE_END(asm_cpu_bringup_and_idle)
@@ -64,23 +87,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle)
#endif
#endif
-.pushsection .text
- .balign PAGE_SIZE
-SYM_CODE_START(hypercall_page)
- .rept (PAGE_SIZE / 32)
- UNWIND_HINT_FUNC
- .skip 31, 0x90
- ret
- .endr
-
-#define HYPERCALL(n) \
- .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
- .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
-#include <asm/xen-hypercalls.h>
-#undef HYPERCALL
-SYM_CODE_END(hypercall_page)
-.popsection
-
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 8d7ec49a35fb..fd0fec6e92f4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -51,6 +51,7 @@ void __init xen_remap_memory(void);
phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
+void xen_banner(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
@@ -75,9 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
-extern int xen_have_vcpu_info_placement;
-
-int xen_vcpu_setup(int cpu);
+void xen_vcpu_setup(int cpu);
void xen_vcpu_info_reset(int cpu);
void xen_setup_vcpu_info_placement(void);
@@ -109,7 +108,7 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
@@ -118,6 +117,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
}
#endif
+void xen_add_preferred_consoles(void);
+
void __init xen_init_apic(void);
#ifdef CONFIG_XEN_EFI