Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/alpha/include/asm/futex.h26
-rw-r--r--arch/alpha/include/asm/io.h1
-rw-r--r--arch/alpha/include/asm/spinlock.h5
-rw-r--r--arch/alpha/include/asm/types.h2
-rw-r--r--arch/alpha/include/asm/unistd.h2
-rw-r--r--arch/alpha/include/uapi/asm/types.h12
-rw-r--r--arch/alpha/include/uapi/asm/unistd.h14
-rw-r--r--arch/alpha/kernel/core_marvel.c6
-rw-r--r--arch/alpha/kernel/core_titan.c2
-rw-r--r--arch/alpha/kernel/module.c3
-rw-r--r--arch/alpha/kernel/smp.c2
-rw-r--r--arch/alpha/kernel/systbls.S9
-rw-r--r--arch/alpha/lib/Makefile22
-rw-r--r--arch/alpha/lib/copy_user.S2
-rw-r--r--arch/alpha/lib/ev6-copy_user.S7
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/Makefile2
-rw-r--r--arch/arc/boot/dts/axc001.dtsi20
-rw-r--r--arch/arc/boot/dts/axc003.dtsi21
-rw-r--r--arch/arc/boot/dts/axc003_idu.dtsi21
-rw-r--r--arch/arc/boot/dts/axs10x_mb.dtsi2
-rw-r--r--arch/arc/configs/haps_hs_defconfig1
-rw-r--r--arch/arc/configs/haps_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/nps_defconfig1
-rw-r--r--arch/arc/configs/nsim_700_defconfig1
-rw-r--r--arch/arc/configs/nsim_hs_defconfig1
-rw-r--r--arch/arc/configs/nsim_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_hs_defconfig1
-rw-r--r--arch/arc/configs/nsimosci_hs_smp_defconfig1
-rw-r--r--arch/arc/configs/tb10x_defconfig1
-rw-r--r--arch/arc/include/asm/atomic.h2
-rw-r--r--arch/arc/include/asm/cache.h2
-rw-r--r--arch/arc/include/asm/futex.h40
-rw-r--r--arch/arc/include/asm/mmu.h2
-rw-r--r--arch/arc/include/asm/spinlock.h5
-rw-r--r--arch/arc/kernel/intc-arcv2.c10
-rw-r--r--arch/arc/kernel/intc-compact.c14
-rw-r--r--arch/arc/mm/cache.c50
-rw-r--r--arch/arc/mm/dma.c45
-rw-r--r--arch/arc/mm/tlb.c12
-rw-r--r--arch/arc/plat-sim/Kconfig13
-rw-r--r--arch/arc/plat-sim/platform.c5
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/boot/dts/armada-388-gp.dts4
-rw-r--r--arch/arm/boot/dts/da850-evm.dts21
-rw-r--r--arch/arm/boot/dts/da850-lcdk.dts7
-rw-r--r--arch/arm/boot/dts/dm8168-evm.dts34
-rw-r--r--arch/arm/boot/dts/dm816x.dtsi2
-rw-r--r--arch/arm/boot/dts/dra71-evm.dts4
-rw-r--r--arch/arm/boot/dts/exynos4.dtsi3
-rw-r--r--arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi1
-rw-r--r--arch/arm/boot/dts/imx25.dtsi1
-rw-r--r--arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi4
-rw-r--r--arch/arm/boot/dts/imx7d-sdb.dts16
-rw-r--r--arch/arm/boot/dts/ls1021a.dtsi8
-rw-r--r--arch/arm/boot/dts/rk3288.dtsi4
-rw-r--r--arch/arm/boot/dts/sama5d2.dtsi12
-rw-r--r--arch/arm/boot/dts/sun8i-a83t.dtsi16
-rw-r--r--arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts9
-rw-r--r--arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts19
-rw-r--r--arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts7
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-2.dts8
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-one.dts8
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts5
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts8
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts22
-rw-r--r--arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts16
-rw-r--r--arch/arm/boot/dts/sunxi-h3-h5.dtsi26
-rw-r--r--arch/arm/boot/dts/tango4-vantage-1172.dts2
-rw-r--r--arch/arm/include/asm/arch_gicv3.h34
-rw-r--r--arch/arm/include/asm/futex.h26
-rw-r--r--arch/arm/include/asm/kvm_host.h6
-rw-r--r--arch/arm/include/asm/spinlock.h16
-rw-r--r--arch/arm/include/asm/thread_info.h15
-rw-r--r--arch/arm/include/asm/tlb.h11
-rw-r--r--arch/arm/include/asm/traps.h7
-rw-r--r--arch/arm/include/asm/uaccess.h2
-rw-r--r--arch/arm/kernel/entry-common.S9
-rw-r--r--arch/arm/kernel/signal.c5
-rw-r--r--arch/arm/mach-at91/Kconfig2
-rw-r--r--arch/arm/mach-at91/pm.c12
-rw-r--r--arch/arm/mach-davinci/board-da850-evm.c4
-rw-r--r--arch/arm/mach-davinci/clock.c9
-rw-r--r--arch/arm/mach-ep93xx/clock.c20
-rw-r--r--arch/arm/mach-hisi/Kconfig1
-rw-r--r--arch/arm/mach-ixp4xx/include/mach/io.h34
-rw-r--r--arch/arm/mach-mmp/devices.c2
-rw-r--r--arch/arm/mach-mvebu/platsmp.c2
-rw-r--r--arch/arm/mach-omap1/board-ams-delta.c12
-rw-r--r--arch/arm/mach-omap1/board-osk.c4
-rw-r--r--arch/arm/mach-omap2/Makefile2
-rw-r--r--arch/arm/mach-omap2/board-generic.c3
-rw-r--r--arch/arm/mach-omap2/display.c119
-rw-r--r--arch/arm/mach-omap2/display.h1
-rw-r--r--arch/arm/mach-omap2/drm.c53
-rw-r--r--arch/arm/mach-omap2/io.c3
-rw-r--r--arch/arm/mach-omap2/pm34xx.c1
-rw-r--r--arch/arm/mach-omap2/prm3xxx.c7
-rw-r--r--arch/arm/mach-omap2/prm44xx.c55
-rw-r--r--arch/arm/mach-prima2/common.c2
-rw-r--r--arch/arm/mach-pxa/Kconfig1
-rw-r--r--arch/arm/mach-pxa/include/mach/mtd-xip.h10
-rw-r--r--arch/arm/mach-rpc/include/mach/hardware.h4
-rw-r--r--arch/arm/mach-sa1100/clock.c25
-rw-r--r--arch/arm/mach-sa1100/include/mach/mtd-xip.h4
-rw-r--r--arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c6
-rw-r--r--arch/arm/mach-w90x900/clock.c29
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts15
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts15
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts16
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts15
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi20
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts17
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts17
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts17
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi3
-rw-r--r--arch/arm64/boot/dts/amlogic/meson-gx.dtsi2
-rw-r--r--arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts4
-rw-r--r--arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts103
-rw-r--r--arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi6
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi12
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi31
-rw-r--r--arch/arm64/boot/dts/marvell/armada-37xx.dtsi2
-rw-r--r--arch/arm64/boot/dts/marvell/armada-ap806.dtsi4
-rw-r--r--arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi1
-rw-r--r--arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi3
-rw-r--r--arch/arm64/boot/dts/renesas/salvator-common.dtsi4
-rw-r--r--arch/arm64/boot/dts/renesas/ulcb.dtsi2
-rw-r--r--arch/arm64/configs/defconfig1
-rw-r--r--arch/arm64/include/asm/arch_gicv3.h7
-rw-r--r--arch/arm64/include/asm/arch_timer.h4
-rw-r--r--arch/arm64/include/asm/elf.h4
-rw-r--r--arch/arm64/include/asm/futex.h26
-rw-r--r--arch/arm64/include/asm/kvm_host.h6
-rw-r--r--arch/arm64/include/asm/memory.h6
-rw-r--r--arch/arm64/include/asm/spinlock.h69
-rw-r--r--arch/arm64/include/asm/thread_info.h4
-rw-r--r--arch/arm64/include/asm/traps.h7
-rw-r--r--arch/arm64/include/asm/uaccess.h3
-rw-r--r--arch/arm64/kernel/fpsimd.c2
-rw-r--r--arch/arm64/kernel/head.S1
-rw-r--r--arch/arm64/kernel/kaslr.c20
-rw-r--r--arch/arm64/kernel/process.c2
-rw-r--r--arch/arm64/kernel/signal.c5
-rw-r--r--arch/arm64/kernel/traps.c2
-rw-r--r--arch/arm64/kvm/sys_regs.c2
-rw-r--r--arch/arm64/mm/fault.c20
-rw-r--r--arch/blackfin/include/asm/spinlock.h5
-rw-r--r--arch/blackfin/kernel/module.c39
-rw-r--r--arch/c6x/configs/dsk6455_defconfig2
-rw-r--r--arch/c6x/configs/evmc6457_defconfig2
-rw-r--r--arch/c6x/configs/evmc6472_defconfig2
-rw-r--r--arch/c6x/configs/evmc6474_defconfig2
-rw-r--r--arch/c6x/configs/evmc6678_defconfig2
-rw-r--r--arch/c6x/platforms/megamod-pic.c22
-rw-r--r--arch/c6x/platforms/plldata.c4
-rw-r--r--arch/c6x/platforms/timer64.c8
-rw-r--r--arch/cris/arch-v32/mach-a3/arbiter.c4
-rw-r--r--arch/cris/arch-v32/mach-fs/arbiter.c4
-rw-r--r--arch/cris/kernel/traps.c6
-rw-r--r--arch/frv/include/asm/futex.h3
-rw-r--r--arch/frv/kernel/futex.c27
-rw-r--r--arch/h8300/include/asm/traps.h6
-rw-r--r--arch/hexagon/include/asm/atomic.h2
-rw-r--r--arch/hexagon/include/asm/futex.h38
-rw-r--r--arch/hexagon/include/asm/spinlock.h5
-rw-r--r--arch/ia64/include/asm/acpi.h2
-rw-r--r--arch/ia64/include/asm/futex.h25
-rw-r--r--arch/ia64/include/asm/spinlock.h21
-rw-r--r--arch/ia64/include/asm/tlb.h8
-rw-r--r--arch/ia64/kernel/efi.c4
-rw-r--r--arch/m32r/include/asm/flat.h3
-rw-r--r--arch/m32r/include/asm/spinlock.h5
-rw-r--r--arch/metag/Kconfig1
-rw-r--r--arch/metag/include/asm/atomic_lock1.h2
-rw-r--r--arch/metag/include/asm/spinlock.h5
-rw-r--r--arch/microblaze/include/asm/flat.h2
-rw-r--r--arch/microblaze/include/asm/futex.h38
-rw-r--r--arch/mips/Kconfig2
-rw-r--r--arch/mips/Makefile15
-rw-r--r--arch/mips/boot/compressed/.gitignore2
-rw-r--r--arch/mips/cavium-octeon/octeon-usb.c2
-rw-r--r--arch/mips/dec/int-handler.S34
-rw-r--r--arch/mips/include/asm/cache.h2
-rw-r--r--arch/mips/include/asm/cpu-features.h3
-rw-r--r--arch/mips/include/asm/futex.h25
-rw-r--r--arch/mips/include/asm/kvm_host.h5
-rw-r--r--arch/mips/include/asm/mach-ralink/ralink_regs.h2
-rw-r--r--arch/mips/include/asm/octeon/cvmx-l2c-defs.h37
-rw-r--r--arch/mips/include/asm/octeon/cvmx-l2d-defs.h60
-rw-r--r--arch/mips/include/asm/octeon/cvmx.h1
-rw-r--r--arch/mips/kernel/ptrace.c10
-rw-r--r--arch/mips/kernel/scall32-o32.S11
-rw-r--r--arch/mips/kernel/scall64-o32.S6
-rw-r--r--arch/mips/kernel/smp.c12
-rw-r--r--arch/mips/mm/uasm-mips.c2
-rw-r--r--arch/mips/net/ebpf_jit.c1950
-rw-r--r--arch/mips/pci/pci.c7
-rw-r--r--arch/mips/ralink/mt7620.c1
-rw-r--r--arch/mips/vdso/gettimeofday.c6
-rw-r--r--arch/mn10300/include/asm/spinlock.h5
-rw-r--r--arch/openrisc/include/asm/futex.h39
-rw-r--r--arch/parisc/Kconfig3
-rw-r--r--arch/parisc/include/asm/atomic.h2
-rw-r--r--arch/parisc/include/asm/futex.h26
-rw-r--r--arch/parisc/include/asm/spinlock.h7
-rw-r--r--arch/parisc/include/asm/thread_info.h2
-rw-r--r--arch/parisc/kernel/cache.c5
-rw-r--r--arch/parisc/kernel/irq.c2
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/boot/Makefile14
-rw-r--r--arch/powerpc/configs/powernv_defconfig3
-rw-r--r--arch/powerpc/configs/ppc64_defconfig3
-rw-r--r--arch/powerpc/configs/pseries_defconfig3
-rw-r--r--arch/powerpc/include/asm/barrier.h7
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h10
-rw-r--r--arch/powerpc/include/asm/futex.h26
-rw-r--r--arch/powerpc/include/asm/kvm_host.h5
-rw-r--r--arch/powerpc/include/asm/mmu_context.h18
-rw-r--r--arch/powerpc/include/asm/pgtable-be-types.h1
-rw-r--r--arch/powerpc/include/asm/pgtable-types.h1
-rw-r--r--arch/powerpc/include/asm/spinlock.h36
-rw-r--r--arch/powerpc/kernel/entry_64.S60
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S10
-rw-r--r--arch/powerpc/kernel/idle_book3s.S8
-rw-r--r--arch/powerpc/kernel/irq.c15
-rw-r--r--arch/powerpc/kernel/process.c9
-rw-r--r--arch/powerpc/kernel/ptrace.c13
-rw-r--r--arch/powerpc/kernel/smp.c18
-rw-r--r--arch/powerpc/kernel/watchdog.c49
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c56
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S3
-rw-r--r--arch/powerpc/kvm/book3s_xive_template.c68
-rw-r--r--arch/powerpc/perf/core-book3s.c3
-rw-r--r--arch/powerpc/platforms/83xx/mpc832x_rdb.c2
-rw-r--r--arch/powerpc/platforms/powernv/idle.c41
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c10
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c8
-rw-r--r--arch/s390/include/asm/compat.h5
-rw-r--r--arch/s390/include/asm/futex.h23
-rw-r--r--arch/s390/include/asm/mmu_context.h5
-rw-r--r--arch/s390/include/asm/spinlock.h7
-rw-r--r--arch/s390/include/asm/tlb.h17
-rw-r--r--arch/s390/kvm/sthyi.c7
-rw-r--r--arch/s390/mm/mmap.c6
-rw-r--r--arch/s390/net/bpf_jit_comp.c3
-rw-r--r--arch/sh/include/asm/futex.h26
-rw-r--r--arch/sh/include/asm/spinlock-cas.h5
-rw-r--r--arch/sh/include/asm/spinlock-llsc.h5
-rw-r--r--arch/sh/include/asm/tlb.h8
-rw-r--r--arch/sparc/configs/sparc32_defconfig4
-rw-r--r--arch/sparc/configs/sparc64_defconfig4
-rw-r--r--arch/sparc/include/asm/atomic_32.h2
-rw-r--r--arch/sparc/include/asm/futex_64.h26
-rw-r--r--arch/sparc/include/asm/mmu_context_64.h14
-rw-r--r--arch/sparc/include/asm/page_32.h2
-rw-r--r--arch/sparc/include/asm/spinlock_32.h5
-rw-r--r--arch/sparc/include/asm/spitfire.h16
-rw-r--r--arch/sparc/kernel/cpu.c6
-rw-r--r--arch/sparc/kernel/cpumap.c1
-rw-r--r--arch/sparc/kernel/head_64.S22
-rw-r--r--arch/sparc/kernel/pci_sun4v.c2
-rw-r--r--arch/sparc/kernel/pcic.c2
-rw-r--r--arch/sparc/kernel/setup_64.c15
-rw-r--r--arch/sparc/kernel/tsb.S12
-rw-r--r--arch/sparc/lib/U3memcpy.S4
-rw-r--r--arch/sparc/lib/multi3.S24
-rw-r--r--arch/sparc/mm/init_64.c39
-rw-r--r--arch/sparc/power/hibernate.c3
-rw-r--r--arch/tile/include/asm/atomic_32.h2
-rw-r--r--arch/tile/include/asm/futex.h40
-rw-r--r--arch/tile/include/asm/spinlock_32.h2
-rw-r--r--arch/tile/include/asm/spinlock_64.h2
-rw-r--r--arch/tile/lib/spinlock_32.c23
-rw-r--r--arch/tile/lib/spinlock_64.c22
-rw-r--r--arch/um/include/asm/tlb.h13
-rw-r--r--arch/um/include/asm/unwind.h8
-rw-r--r--arch/x86/Kbuild3
-rw-r--r--arch/x86/Kconfig60
-rw-r--r--arch/x86/Kconfig.debug59
-rw-r--r--arch/x86/Makefile17
-rw-r--r--arch/x86/boot/compressed/eboot.c2
-rw-r--r--arch/x86/boot/compressed/head_32.S129
-rw-r--r--arch/x86/boot/compressed/head_64.S112
-rw-r--r--arch/x86/boot/compressed/kaslr.c147
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/compressed/pagetable.c7
-rw-r--r--arch/x86/boot/header.S8
-rw-r--r--arch/x86/configs/tiny.config2
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S67
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c2
-rw-r--r--arch/x86/entry/Makefile1
-rw-r--r--arch/x86/entry/calling.h5
-rw-r--r--arch/x86/entry/common.c3
-rw-r--r--arch/x86/entry/entry_64.S190
-rw-r--r--arch/x86/entry/entry_64_compat.S11
-rw-r--r--arch/x86/events/amd/uncore.c21
-rw-r--r--arch/x86/events/core.c80
-rw-r--r--arch/x86/events/intel/bts.c4
-rw-r--r--arch/x86/events/intel/core.c107
-rw-r--r--arch/x86/events/intel/ds.c58
-rw-r--r--arch/x86/events/intel/lbr.c52
-rw-r--r--arch/x86/events/intel/p4.c2
-rw-r--r--arch/x86/events/intel/pt.c5
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c12
-rw-r--r--arch/x86/events/intel/uncore_snb.c6
-rw-r--r--arch/x86/events/intel/uncore_snbep.c42
-rw-r--r--arch/x86/events/perf_event.h10
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/acpi.h13
-rw-r--r--arch/x86/include/asm/asm.h6
-rw-r--r--arch/x86/include/asm/atomic.h69
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h73
-rw-r--r--arch/x86/include/asm/cmdline.h2
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h5
-rw-r--r--arch/x86/include/asm/disabled-features.h4
-rw-r--r--arch/x86/include/asm/dma-mapping.h5
-rw-r--r--arch/x86/include/asm/dmi.h8
-rw-r--r--arch/x86/include/asm/e820/api.h2
-rw-r--r--arch/x86/include/asm/elf.h23
-rw-r--r--arch/x86/include/asm/fixmap.h20
-rw-r--r--arch/x86/include/asm/fpu/internal.h6
-rw-r--r--arch/x86/include/asm/futex.h40
-rw-r--r--arch/x86/include/asm/hypervisor.h10
-rw-r--r--arch/x86/include/asm/init.h1
-rw-r--r--arch/x86/include/asm/io.h106
-rw-r--r--arch/x86/include/asm/kexec.h11
-rw-r--r--arch/x86/include/asm/kvm_host.h5
-rw-r--r--arch/x86/include/asm/lguest.h91
-rw-r--r--arch/x86/include/asm/lguest_hcall.h74
-rw-r--r--arch/x86/include/asm/mem_encrypt.h80
-rw-r--r--arch/x86/include/asm/mmu.h25
-rw-r--r--arch/x86/include/asm/mmu_context.h19
-rw-r--r--arch/x86/include/asm/module.h9
-rw-r--r--arch/x86/include/asm/mpx.h9
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/orc_lookup.h46
-rw-r--r--arch/x86/include/asm/orc_types.h107
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/page_types.h3
-rw-r--r--arch/x86/include/asm/pgtable.h28
-rw-r--r--arch/x86/include/asm/pgtable_types.h58
-rw-r--r--arch/x86/include/asm/processor-flags.h13
-rw-r--r--arch/x86/include/asm/processor.h25
-rw-r--r--arch/x86/include/asm/ptrace.h43
-rw-r--r--arch/x86/include/asm/realmode.h12
-rw-r--r--arch/x86/include/asm/refcount.h109
-rw-r--r--arch/x86/include/asm/rmwcc.h37
-rw-r--r--arch/x86/include/asm/set_memory.h3
-rw-r--r--arch/x86/include/asm/thread_info.h5
-rw-r--r--arch/x86/include/asm/tlb.h14
-rw-r--r--arch/x86/include/asm/tlbflush.h87
-rw-r--r--arch/x86/include/asm/topology.h6
-rw-r--r--arch/x86/include/asm/uaccess.h7
-rw-r--r--arch/x86/include/asm/unwind.h76
-rw-r--r--arch/x86/include/asm/unwind_hints.h105
-rw-r--r--arch/x86/include/asm/vga.h14
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h2
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/asm-offsets_32.c20
-rw-r--r--arch/x86/kernel/cpu/amd.c55
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c43
-rw-r--r--arch/x86/kernel/cpu/bugs.c8
-rw-r--r--arch/x86/kernel/cpu/common.c40
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c32
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c43
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c27
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c18
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/dumpstack.c14
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c4
-rw-r--r--arch/x86/kernel/e820.c26
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/head64.c100
-rw-r--r--arch/x86/kernel/head_32.S22
-rw-r--r--arch/x86/kernel/head_64.S40
-rw-r--r--arch/x86/kernel/hpet.c27
-rw-r--r--arch/x86/kernel/kdebugfs.c34
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/ksysfs.c32
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/ldt.c21
-rw-r--r--arch/x86/kernel/machine_kexec_64.c25
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/mpparse.c108
-rw-r--r--arch/x86/kernel/nmi.c18
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c15
-rw-r--r--arch/x86/kernel/platform-quirks.c1
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c242
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S14
-rw-r--r--arch/x86/kernel/setup.c12
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/smpboot.c30
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c30
-rw-r--r--arch/x86/kernel/unwind_frame.c41
-rw-r--r--arch/x86/kernel/unwind_guess.c5
-rw-r--r--arch/x86/kernel/unwind_orc.c582
-rw-r--r--arch/x86/kernel/vmlinux.lds.S3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/mmu.c41
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/svm.c54
-rw-r--r--arch/x86/kvm/vmx.c264
-rw-r--r--arch/x86/kvm/x86.c51
-rw-r--r--arch/x86/lguest/Kconfig14
-rw-r--r--arch/x86/lguest/Makefile2
-rw-r--r--arch/x86/lguest/boot.c1558
-rw-r--r--arch/x86/lguest/head_32.S192
-rw-r--r--arch/x86/lib/cmdline.c105
-rw-r--r--arch/x86/math-emu/div_Xsig.S1
-rw-r--r--arch/x86/math-emu/div_small.S2
-rw-r--r--arch/x86/math-emu/mul_Xsig.S4
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S1
-rw-r--r--arch/x86/math-emu/reg_norm.S2
-rw-r--r--arch/x86/math-emu/reg_round.S2
-rw-r--r--arch/x86/math-emu/reg_u_add.S1
-rw-r--r--arch/x86/math-emu/reg_u_div.S2
-rw-r--r--arch/x86/math-emu/reg_u_mul.S1
-rw-r--r--arch/x86/math-emu/reg_u_sub.S1
-rw-r--r--arch/x86/math-emu/round_Xsig.S4
-rw-r--r--arch/x86/math-emu/shr_Xsig.S1
-rw-r--r--arch/x86/math-emu/wm_shrx.S2
-rw-r--r--arch/x86/math-emu/wm_sqrt.S1
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c93
-rw-r--r--arch/x86/mm/extable.c44
-rw-r--r--arch/x86/mm/fault.c26
-rw-r--r--arch/x86/mm/hugetlbpage.c27
-rw-r--r--arch/x86/mm/ident_map.c12
-rw-r--r--arch/x86/mm/init.c5
-rw-r--r--arch/x86/mm/ioremap.c287
-rw-r--r--arch/x86/mm/kasan_init_64.c6
-rw-r--r--arch/x86/mm/mem_encrypt.c593
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S149
-rw-r--r--arch/x86/mm/mmap.c19
-rw-r--r--arch/x86/mm/mpx.c33
-rw-r--r--arch/x86/mm/numa_emulation.c55
-rw-r--r--arch/x86/mm/pageattr.c67
-rw-r--r--arch/x86/mm/pat.c9
-rw-r--r--arch/x86/mm/pgtable.c8
-rw-r--r--arch/x86/mm/tlb.c331
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/platform/efi/efi.c6
-rw-r--r--arch/x86/platform/efi/efi_64.c15
-rw-r--r--arch/x86/platform/uv/tlb_uv.c4
-rw-r--r--arch/x86/realmode/init.c12
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S24
-rw-r--r--arch/x86/um/user-offsets.c2
-rw-r--r--arch/x86/xen/Kconfig5
-rw-r--r--arch/x86/xen/enlighten_hvm.c59
-rw-r--r--arch/x86/xen/enlighten_pv.c66
-rw-r--r--arch/x86/xen/mmu_pv.c5
-rw-r--r--arch/x86/xen/xen-asm.S26
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S27
-rw-r--r--arch/x86/xen/xen-asm_64.S61
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h15
-rw-r--r--arch/xtensa/include/asm/Kbuild2
-rw-r--r--arch/xtensa/include/asm/device.h15
-rw-r--r--arch/xtensa/include/asm/futex.h27
-rw-r--r--arch/xtensa/include/asm/param.h18
-rw-r--r--arch/xtensa/include/asm/spinlock.h5
-rw-r--r--arch/xtensa/kernel/setup.c6
-rw-r--r--arch/xtensa/kernel/xtensa_ksyms.c2
-rw-r--r--arch/xtensa/mm/cache.c16
487 files changed, 9167 insertions, 5297 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089117fe..2520ca5b42eb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -931,6 +931,18 @@ config STRICT_MODULE_RWX
config ARCH_WANT_RELAX_ORDER
bool
+config ARCH_HAS_REFCOUNT
+ bool
+ help
+ An architecture selects this when it has implemented refcount_t
+ using open coded assembly primitives that provide an optimized
+ refcount_t implementation, possibly at the expense of some full
+ refcount state checks of CONFIG_REFCOUNT_FULL=y.
+
+ The refcount overflow check behavior, however, must be retained.
+ Catching overflows is the primary security concern for protecting
+ against bugs in reference counts.
+
config REFCOUNT_FULL
bool "Perform full reference count validation at the expense of speed"
help
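
The help text above refers to the generic refcount_t API. As a minimal sketch (hypothetical object and function names, not part of this series), the usage pattern whose overflow protection an ARCH_HAS_REFCOUNT implementation must keep intact looks roughly like this:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t refs;
	/* payload ... */
};

static struct obj *obj_alloc(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->refs, 1);	/* one reference for the creator */
	return o;
}

static struct obj *obj_get(struct obj *o)
{
	refcount_inc(&o->refs);			/* saturates and warns on overflow, never wraps back to 0 */
	return o;
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refs))	/* true once the last reference is dropped */
		kfree(o);
}
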
diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index fb01dfb760c2..05a70edd57b6 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -25,18 +25,10 @@
: "r" (uaddr), "r"(oparg) \
: "memory")
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
pagefault_disable();
@@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
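
This is the first of a series of identical conversions across architectures below (arc, arm, arm64, ia64, mips, ...): the per-arch helper now only performs the atomic read-modify-write and hands back the old value, while decoding of the encoded op word and the FUTEX_OP_CMP_* comparison live once in the generic futex code. A sketch of the shape of that generic caller, following the futex ABI field layout (illustrative, not a verbatim quote of kernel/futex.c):

/* needs <linux/futex.h> for FUTEX_OP_* and <linux/bitops.h> for sign_extend32() */
static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
	unsigned int op  = (encoded_op & 0x70000000) >> 28;
	unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
	int oparg  = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
	int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	/* the arch helper only does the atomic op and reports the old value */
	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval <  cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval >  cmparg;
	default:	      return -ENOSYS;
	}
}
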
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index ff4049155c84..4d61d2a50c52 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -299,6 +299,7 @@ static inline void __iomem * ioremap_nocache(unsigned long offset,
return ioremap(offset, size);
}
+#define ioremap_wc ioremap_nocache
#define ioremap_uc ioremap_nocache
static inline void iounmap(volatile void __iomem *addr)
diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h
index a40b9fc0c6c3..718ac0b64adf 100644
--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -16,11 +16,6 @@
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
#define arch_spin_is_locked(x) ((x)->lock != 0)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
return lock.lock == 0;
diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h
index 4cb4b6d3452c..0bc66e1d3a7e 100644
--- a/arch/alpha/include/asm/types.h
+++ b/arch/alpha/include/asm/types.h
@@ -1,6 +1,6 @@
#ifndef _ALPHA_TYPES_H
#define _ALPHA_TYPES_H
-#include <asm-generic/int-ll64.h>
+#include <uapi/asm/types.h>
#endif /* _ALPHA_TYPES_H */
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index b37153ecf2ac..db7fc0f511e2 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -3,7 +3,7 @@
#include <uapi/asm/unistd.h>
-#define NR_SYSCALLS 514
+#define NR_SYSCALLS 523
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_STAT64
diff --git a/arch/alpha/include/uapi/asm/types.h b/arch/alpha/include/uapi/asm/types.h
index 9fd3cd459777..8d1024d7be05 100644
--- a/arch/alpha/include/uapi/asm/types.h
+++ b/arch/alpha/include/uapi/asm/types.h
@@ -9,8 +9,18 @@
* need to be careful to avoid a name clashes.
*/
-#ifndef __KERNEL__
+/*
+ * This is here because we used to use l64 for alpha
+ * and we don't want to impact user mode with our change to ll64
+ * in the kernel.
+ *
+ * However, some user programs are fine with this. They can
+ * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here.
+ */
+#if !defined(__SANE_USERSPACE_TYPES__) && !defined(__KERNEL__)
#include <asm-generic/int-l64.h>
+#else
+#include <asm-generic/int-ll64.h>
#endif
#endif /* _UAPI_ALPHA_TYPES_H */
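
The opt-in mentioned in the comment is driven entirely from user space. A hypothetical user program (not part of this diff) that wants the ll64 flavour on alpha would define __SANE_USERSPACE_TYPES__ before pulling in any uapi header:

#define __SANE_USERSPACE_TYPES__
#include <asm/types.h>
#include <stdio.h>

int main(void)
{
	__u64 v = 42;

	/*
	 * With ll64 types __u64 is unsigned long long, so the cast below is a
	 * no-op; with the legacy l64 layout it would be unsigned long instead.
	 */
	printf("%llu\n", (unsigned long long)v);
	return 0;
}
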
diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h
index aa33bf5aacb6..a2945fea6c86 100644
--- a/arch/alpha/include/uapi/asm/unistd.h
+++ b/arch/alpha/include/uapi/asm/unistd.h
@@ -475,5 +475,19 @@
#define __NR_getrandom 511
#define __NR_memfd_create 512
#define __NR_execveat 513
+#define __NR_seccomp 514
+#define __NR_bpf 515
+#define __NR_userfaultfd 516
+#define __NR_membarrier 517
+#define __NR_mlock2 518
+#define __NR_copy_file_range 519
+#define __NR_preadv2 520
+#define __NR_pwritev2 521
+#define __NR_statx 522
+
+/* Alpha doesn't have protection keys. */
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
#endif /* _UAPI_ALPHA_UNISTD_H */
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index d5f0580746a5..03ff832b1cb4 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -351,7 +351,7 @@ marvel_init_io7(struct io7 *io7)
}
}
-void
+void __init
marvel_io7_present(gct6_node *node)
{
int pe;
@@ -369,6 +369,7 @@ marvel_io7_present(gct6_node *node)
static void __init
marvel_find_console_vga_hose(void)
{
+#ifdef CONFIG_VGA_HOSE
u64 *pu64 = (u64 *)((u64)hwrpb + hwrpb->ctbt_offset);
if (pu64[7] == 3) { /* TERM_TYPE == graphics */
@@ -402,9 +403,10 @@ marvel_find_console_vga_hose(void)
pci_vga_hose = hose;
}
}
+#endif
}
-gct6_search_struct gct_wanted_node_list[] = {
+gct6_search_struct gct_wanted_node_list[] __initdata = {
{ GCT_TYPE_HOSE, GCT_SUBTYPE_IO_PORT_MODULE, marvel_io7_present },
{ 0, 0, NULL }
};
diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c
index 219bf271c0ba..b532d925443d 100644
--- a/arch/alpha/kernel/core_titan.c
+++ b/arch/alpha/kernel/core_titan.c
@@ -461,6 +461,7 @@ titan_ioremap(unsigned long addr, unsigned long size)
unsigned long *ptes;
unsigned long pfn;
+#ifdef CONFIG_VGA_HOSE
/*
* Adjust the address and hose, if necessary.
*/
@@ -468,6 +469,7 @@ titan_ioremap(unsigned long addr, unsigned long size)
h = pci_vga_hose->index;
addr += pci_vga_hose->mem_space->start;
}
+#endif
/*
* Find the hose.
diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c
index 936bc8f89a67..47632fa8c24e 100644
--- a/arch/alpha/kernel/module.c
+++ b/arch/alpha/kernel/module.c
@@ -181,6 +181,9 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab,
switch (r_type) {
case R_ALPHA_NONE:
break;
+ case R_ALPHA_REFLONG:
+ *(u32 *)location = value;
+ break;
case R_ALPHA_REFQUAD:
/* BUG() can produce misaligned relocations. */
((u32 *)location)[0] = value;
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 9fc560459ebd..f6726a746427 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -115,7 +115,7 @@ wait_boot_cpu_to_stop(int cpuid)
/*
* Where secondaries begin a life of C.
*/
-void
+void __init
smp_callin(void)
{
int cpuid = hard_smp_processor_id();
diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 9b62e3fd4f03..5b4514abb234 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -532,6 +532,15 @@ sys_call_table:
.quad sys_getrandom
.quad sys_memfd_create
.quad sys_execveat
+ .quad sys_seccomp
+ .quad sys_bpf /* 515 */
+ .quad sys_userfaultfd
+ .quad sys_membarrier
+ .quad sys_mlock2
+ .quad sys_copy_file_range
+ .quad sys_preadv2 /* 520 */
+ .quad sys_pwritev2
+ .quad sys_statx
.size sys_call_table, . - sys_call_table
.type sys_call_table, @object
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
index 7083434dd241..a80815960364 100644
--- a/arch/alpha/lib/Makefile
+++ b/arch/alpha/lib/Makefile
@@ -20,12 +20,8 @@ lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \
checksum.o \
csum_partial_copy.o \
$(ev67-y)strlen.o \
- $(ev67-y)strcat.o \
- strcpy.o \
- $(ev67-y)strncat.o \
- strncpy.o \
- $(ev6-y)stxcpy.o \
- $(ev6-y)stxncpy.o \
+ stycpy.o \
+ styncpy.o \
$(ev67-y)strchr.o \
$(ev67-y)strrchr.o \
$(ev6-y)memchr.o \
@@ -49,3 +45,17 @@ AFLAGS___remlu.o = -DREM -DINTSIZE
$(addprefix $(obj)/,__divqu.o __remqu.o __divlu.o __remlu.o): \
$(src)/$(ev6-y)divide.S FORCE
$(call if_changed_rule,as_o_S)
+
+# There are direct branches between {str*cpy,str*cat} and stx*cpy.
+# Ensure the branches are within range by merging these objects.
+
+LDFLAGS_stycpy.o := -r
+LDFLAGS_styncpy.o := -r
+
+$(obj)/stycpy.o: $(obj)/strcpy.o $(obj)/$(ev67-y)strcat.o \
+ $(obj)/$(ev6-y)stxcpy.o FORCE
+ $(call if_changed,ld)
+
+$(obj)/styncpy.o: $(obj)/strncpy.o $(obj)/$(ev67-y)strncat.o \
+ $(obj)/$(ev6-y)stxncpy.o FORCE
+ $(call if_changed,ld)
diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S
index 159f1b7e6e49..c277a1a4383e 100644
--- a/arch/alpha/lib/copy_user.S
+++ b/arch/alpha/lib/copy_user.S
@@ -34,7 +34,7 @@
.ent __copy_user
__copy_user:
.prologue 0
- and $18,$18,$0
+ mov $18,$0
and $16,7,$3
beq $0,$35
beq $3,$36
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S
index 35e6710d0700..954ca03ebebe 100644
--- a/arch/alpha/lib/ev6-copy_user.S
+++ b/arch/alpha/lib/ev6-copy_user.S
@@ -45,9 +45,10 @@
# Pipeline info: Slotting & Comments
__copy_user:
.prologue 0
- andq $18, $18, $0
- subq $18, 32, $1 # .. E .. .. : Is this going to be a small copy?
- beq $0, $zerolength # U .. .. .. : U L U L
+ mov $18, $0 # .. .. .. E
+ subq $18, 32, $1 # .. .. E. .. : Is this going to be a small copy?
+ nop # .. E .. ..
+ beq $18, $zerolength # U .. .. .. : U L U L
and $16,7,$3 # .. .. .. E : is leading dest misalignment
ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index a5459698f0ee..7db85ab00c52 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -96,7 +96,6 @@ menu "ARC Architecture Configuration"
menu "ARC Platform/SoC/Board"
-source "arch/arc/plat-sim/Kconfig"
source "arch/arc/plat-tb10x/Kconfig"
source "arch/arc/plat-axs10x/Kconfig"
#New platform adds here
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index 44ef35d33956..3a61cfcc38c0 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -107,7 +107,7 @@ core-y += arch/arc/
# w/o this dtb won't embed into kernel binary
core-y += arch/arc/boot/dts/
-core-$(CONFIG_ARC_PLAT_SIM) += arch/arc/plat-sim/
+core-y += arch/arc/plat-sim/
core-$(CONFIG_ARC_PLAT_TB10X) += arch/arc/plat-tb10x/
core-$(CONFIG_ARC_PLAT_AXS10X) += arch/arc/plat-axs10x/
core-$(CONFIG_ARC_PLAT_EZNPS) += arch/arc/plat-eznps/
diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi
index 53ce226f77a5..a380ffa1a458 100644
--- a/arch/arc/boot/dts/axc001.dtsi
+++ b/arch/arc/boot/dts/axc001.dtsi
@@ -15,15 +15,15 @@
/ {
compatible = "snps,arc";
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
cpu_card {
compatible = "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
- ranges = <0x00000000 0xf0000000 0x10000000>;
+ ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
core_clk: core_clk {
#clock-cells = <0>;
@@ -91,23 +91,21 @@
mb_intc: dw-apb-ictl@0xe0012000 {
#interrupt-cells = <1>;
compatible = "snps,dw-apb-ictl";
- reg = < 0xe0012000 0x200 >;
+ reg = < 0x0 0xe0012000 0x0 0x200 >;
interrupt-controller;
interrupt-parent = <&core_intc>;
interrupts = < 7 >;
};
memory {
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0x00000000 0x80000000 0x20000000>;
device_type = "memory";
- reg = <0x80000000 0x1b000000>; /* (512 - 32) MiB */
+ /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+ reg = <0x0 0x80000000 0x0 0x1b000000>; /* (512 - 32) MiB */
};
reserved-memory {
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
ranges;
/*
* We just move frame buffer area to the very end of
@@ -118,7 +116,7 @@
*/
frame_buffer: frame_buffer@9e000000 {
compatible = "shared-dma-pool";
- reg = <0x9e000000 0x2000000>;
+ reg = <0x0 0x9e000000 0x0 0x2000000>;
no-map;
};
};
diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 14df46f141bf..cc9239ef8d08 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -14,15 +14,15 @@
/ {
compatible = "snps,arc";
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
cpu_card {
compatible = "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
- ranges = <0x00000000 0xf0000000 0x10000000>;
+ ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
core_clk: core_clk {
#clock-cells = <0>;
@@ -94,30 +94,29 @@
mb_intc: dw-apb-ictl@0xe0012000 {
#interrupt-cells = <1>;
compatible = "snps,dw-apb-ictl";
- reg = < 0xe0012000 0x200 >;
+ reg = < 0x0 0xe0012000 0x0 0x200 >;
interrupt-controller;
interrupt-parent = <&core_intc>;
interrupts = < 24 >;
};
memory {
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0x00000000 0x80000000 0x40000000>;
device_type = "memory";
- reg = <0x80000000 0x20000000>; /* 512MiB */
+ /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+ reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */
+ 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */
};
reserved-memory {
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
ranges;
/*
* Move frame buffer out of IOC aperture (0x8z-0xAz).
*/
frame_buffer: frame_buffer@be000000 {
compatible = "shared-dma-pool";
- reg = <0xbe000000 0x2000000>;
+ reg = <0x0 0xbe000000 0x0 0x2000000>;
no-map;
};
};
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 695f9fa1996b..4ebb2170abec 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -14,15 +14,15 @@
/ {
compatible = "snps,arc";
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
cpu_card {
compatible = "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
- ranges = <0x00000000 0xf0000000 0x10000000>;
+ ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
core_clk: core_clk {
#clock-cells = <0>;
@@ -100,30 +100,29 @@
mb_intc: dw-apb-ictl@0xe0012000 {
#interrupt-cells = <1>;
compatible = "snps,dw-apb-ictl";
- reg = < 0xe0012000 0x200 >;
+ reg = < 0x0 0xe0012000 0x0 0x200 >;
interrupt-controller;
interrupt-parent = <&idu_intc>;
interrupts = <0>;
};
memory {
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0x00000000 0x80000000 0x40000000>;
device_type = "memory";
- reg = <0x80000000 0x20000000>; /* 512MiB */
+ /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+ reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */
+ 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */
};
reserved-memory {
- #address-cells = <1>;
- #size-cells = <1>;
+ #address-cells = <2>;
+ #size-cells = <2>;
ranges;
/*
* Move frame buffer out of IOC aperture (0x8z-0xAz).
*/
frame_buffer: frame_buffer@be000000 {
compatible = "shared-dma-pool";
- reg = <0xbe000000 0x2000000>;
+ reg = <0x0 0xbe000000 0x0 0x2000000>;
no-map;
};
};
diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi
index 41cfb29b62c1..0ff7e07edcd4 100644
--- a/arch/arc/boot/dts/axs10x_mb.dtsi
+++ b/arch/arc/boot/dts/axs10x_mb.dtsi
@@ -13,7 +13,7 @@
compatible = "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
- ranges = <0x00000000 0xe0000000 0x10000000>;
+ ranges = <0x00000000 0x0 0xe0000000 0x10000000>;
interrupt-parent = <&mb_intc>;
i2sclk: i2sclk@100a0 {
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 57b3e599322f..db04ea4dd2d9 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -21,7 +21,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs"
CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index f85985adebb2..821a2e562f3f 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_SMP=y
CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs_idu"
diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig
index ede625c76216..7c9c706ae7f6 100644
--- a/arch/arc/configs/nps_defconfig
+++ b/arch/arc/configs/nps_defconfig
@@ -39,7 +39,6 @@ CONFIG_IP_PNP=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index b0066a749d4c..6dff83a238b8 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700"
CONFIG_PREEMPT=y
# CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index ebe9ebb92933..31ee51b987e7 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -26,7 +26,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs"
CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index 4bde43278be6..8d3b1f67cae4 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -24,7 +24,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_SMP=y
CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu"
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index f6fb3d26557e..6168ce2ac2ef 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci"
# CONFIG_COMPACTION is not set
CONFIG_NET=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index b9f0fe00044b..a70bdeb2b3fd 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs"
# CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 155add7761ed..ef96406c446e 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -18,7 +18,6 @@ CONFIG_MODULES=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
CONFIG_ISA_ARCV2=y
CONFIG_SMP=y
# CONFIG_ARC_TIMERS_64BIT is not set
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 4c5118384eb5..f30182549395 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
# CONFIG_INET_DIAG is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 54b54da6384c..11859287c52a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i)
atomic_ops_unlock(flags);
}
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
#endif
/*
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index 19ebddffb279..02fd1cece6ef 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -96,7 +96,9 @@ extern unsigned long perip_base, perip_end;
#define ARC_REG_SLC_FLUSH 0x904
#define ARC_REG_SLC_INVALIDATE 0x905
#define ARC_REG_SLC_RGN_START 0x914
+#define ARC_REG_SLC_RGN_START1 0x915
#define ARC_REG_SLC_RGN_END 0x916
+#define ARC_REG_SLC_RGN_END1 0x917
/* Bit val in SLC_CONTROL */
#define SLC_CTRL_DIS 0x001
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 11e1b1f3acda..eb887dd13e74 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -73,20 +73,11 @@
#endif
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
- return -EFAULT;
-
#ifndef CONFIG_ARC_HAS_LLSC
preempt_disable(); /* to guarantee atomic r-m-w of futex op */
#endif
@@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
preempt_enable();
#endif
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h
index db7319e9b506..efb79fafff1d 100644
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@@ -94,6 +94,8 @@ static inline int is_pae40_enabled(void)
return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
+extern int pae40_exist_but_not_enab(void);
+
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index 233d5ffe6ec7..a325e6a36523 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -16,11 +16,6 @@
#define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__)
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
#ifdef CONFIG_ARC_HAS_LLSC
static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c
index f928795fd07a..067ea362fb3e 100644
--- a/arch/arc/kernel/intc-arcv2.c
+++ b/arch/arc/kernel/intc-arcv2.c
@@ -75,10 +75,20 @@ void arc_init_IRQ(void)
* Set a default priority for all available interrupts to prevent
* switching of register banks if Fast IRQ and multiple register banks
* are supported by CPU.
+ * Also disable private-per-core IRQ lines so faulty external HW won't
+ * trigger interrupt that kernel is not ready to handle.
*/
for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) {
write_aux_reg(AUX_IRQ_SELECT, i);
write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO);
+
+ /*
+ * Only mask cpu private IRQs here.
+ * "common" interrupts are masked at IDU, otherwise it would
+ * need to be unmasked at each cpu, with IPIs
+ */
+ if (i < FIRST_EXT_IRQ)
+ write_aux_reg(AUX_IRQ_ENABLE, 0);
}
/* setup status32, don't enable intr yet as kernel doesn't want */
diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c
index 7e608c6b0a01..47b421fa0147 100644
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@@ -27,7 +27,7 @@
*/
void arc_init_IRQ(void)
{
- int level_mask = 0;
+ unsigned int level_mask = 0, i;
/* Is timer high priority Interrupt (Level2 in ARCompact jargon) */
level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ;
@@ -40,6 +40,18 @@ void arc_init_IRQ(void)
if (level_mask)
pr_info("Level-2 interrupts bitset %x\n", level_mask);
+
+ /*
+ * Disable all IRQ lines so faulty external hardware won't
+ * trigger interrupt that kernel is not ready to handle.
+ */
+ for (i = TIMER0_IRQ; i < NR_CPU_IRQS; i++) {
+ unsigned int ienb;
+
+ ienb = read_aux_reg(AUX_IENABLE);
+ ienb &= ~(1 << i);
+ write_aux_reg(AUX_IENABLE, ienb);
+ }
}
/*
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index a867575a758b..7db283b46ebd 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -665,6 +665,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned int ctrl;
+ phys_addr_t end;
spin_lock_irqsave(&lock, flags);
@@ -694,8 +695,19 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
* END needs to be setup before START (latter triggers the operation)
* END can't be same as START, so add (l2_line_sz - 1) to sz
*/
- write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1));
- write_aux_reg(ARC_REG_SLC_RGN_START, paddr);
+ end = paddr + sz + l2_line_sz - 1;
+ if (is_pae40_enabled())
+ write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end));
+
+ write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end));
+
+ if (is_pae40_enabled())
+ write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr));
+
+ write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr));
+
+ /* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */
+ read_aux_reg(ARC_REG_SLC_CTRL);
while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
@@ -1111,6 +1123,13 @@ noinline void __init arc_ioc_setup(void)
__dc_enable();
}
+/*
+ * Cache related boot time checks/setups only needed on master CPU:
+ * - Geometry checks (kernel build and hardware agree: e.g. L1_CACHE_BYTES)
+ * Assume SMP only, so all cores will have same cache config. A check on
+ * one core suffices for all
+ * - IOC setup / dma callbacks only need to be done once
+ */
void __init arc_cache_init_master(void)
{
unsigned int __maybe_unused cpu = smp_processor_id();
@@ -1190,12 +1209,27 @@ void __ref arc_cache_init(void)
printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
- /*
- * Only master CPU needs to execute rest of function:
- * - Assume SMP so all cores will have same cache config so
- * any geomtry checks will be same for all
- * - IOC setup / dma callbacks only need to be setup once
- */
if (!cpu)
arc_cache_init_master();
+
+ /*
+ * In PAE regime, TLB and cache maintenance ops take wider addresses
+ * And even if PAE is not enabled in kernel, the upper 32-bits still need
+ * to be zeroed to keep the ops sane.
+ * As an optimization for more common !PAE enabled case, zero them out
+ * once at init, rather than checking/setting to 0 for every runtime op
+ */
+ if (is_isa_arcv2() && pae40_exist_but_not_enab()) {
+
+ if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE))
+ write_aux_reg(ARC_REG_IC_PTAG_HI, 0);
+
+ if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE))
+ write_aux_reg(ARC_REG_DC_PTAG_HI, 0);
+
+ if (l2_line_sz) {
+ write_aux_reg(ARC_REG_SLC_RGN_END1, 0);
+ write_aux_reg(ARC_REG_SLC_RGN_START1, 0);
+ }
+ }
}
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 71d3efff99d3..e9d93604ad0f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -153,6 +153,19 @@ static void _dma_cache_sync(phys_addr_t paddr, size_t size,
}
}
+/*
+ * arc_dma_map_page - map a portion of a page for streaming DMA
+ *
+ * Ensure that any data held in the cache is appropriately discarded
+ * or written back.
+ *
+ * The device owns this memory once this call has completed. The CPU
+ * can regain ownership by calling dma_unmap_page().
+ *
+ * Note: while it takes struct page as arg, caller can "abuse" it to pass
+ * a region larger than PAGE_SIZE, provided it is physically contiguous
+ * and this still works correctly
+ */
static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -165,6 +178,24 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
return plat_phys_to_dma(dev, paddr);
}
+/*
+ * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
+ *
+ * After this call, reads by the CPU to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ *
+ * Note: historically this routine was not implemented for ARC
+ */
+static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t paddr = plat_dma_to_phys(dev, handle);
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ _dma_cache_sync(paddr, size, dir);
+}
+
static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
@@ -178,6 +209,18 @@ static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
return nents;
}
+static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i)
+ arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir,
+ attrs);
+}
+
static void arc_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
{
@@ -223,7 +266,9 @@ const struct dma_map_ops arc_dma_ops = {
.free = arc_dma_free,
.mmap = arc_dma_mmap,
.map_page = arc_dma_map_page,
+ .unmap_page = arc_dma_unmap_page,
.map_sg = arc_dma_map_sg,
+ .unmap_sg = arc_dma_unmap_sg,
.sync_single_for_device = arc_dma_sync_single_for_device,
.sync_single_for_cpu = arc_dma_sync_single_for_cpu,
.sync_sg_for_cpu = arc_dma_sync_sg_for_cpu,
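
Before this change ARC only implemented the map side of streaming DMA. A hedged driver-style sketch (hypothetical function and variable names) of the receive pattern that now gets its cache maintenance at unmap time:

#include <linux/dma-mapping.h>

static int rx_one_buffer(struct device *dev, struct page *page, size_t len)
{
	dma_addr_t handle;

	handle = dma_map_page(dev, page, 0, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... hand 'handle' to the device and wait for it to write 'len' bytes ... */

	dma_unmap_page(dev, handle, len, DMA_FROM_DEVICE);
	/*
	 * On ARC this now reaches arc_dma_unmap_page(), which syncs the CPU
	 * cache, so subsequent reads see the device's data rather than stale
	 * cache lines.
	 */
	return 0;
}
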
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index d0126fdfe2d8..b181f3ee38aa 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -104,6 +104,8 @@
/* A copy of the ASID from the PID reg is kept in asid_cache */
DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
+static int __read_mostly pae_exists;
+
/*
* Utility Routine to erase a J-TLB entry
* Caller needs to setup Index Reg (manually or via getIndex)
@@ -784,7 +786,7 @@ void read_decode_mmu_bcr(void)
mmu->u_dtlb = mmu4->u_dtlb * 4;
mmu->u_itlb = mmu4->u_itlb * 4;
mmu->sasid = mmu4->sasid;
- mmu->pae = mmu4->pae;
+ pae_exists = mmu->pae = mmu4->pae;
}
}
@@ -809,6 +811,11 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
return buf;
}
+int pae40_exist_but_not_enab(void)
+{
+ return pae_exists && !is_pae40_enabled();
+}
+
void arc_mmu_init(void)
{
char str[256];
@@ -859,6 +866,9 @@ void arc_mmu_init(void)
/* swapper_pg_dir is the pgd for the kernel, used by vmalloc */
write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir);
#endif
+
+ if (pae40_exist_but_not_enab())
+ write_aux_reg(ARC_REG_TLBPD1HI, 0);
}
/*
diff --git a/arch/arc/plat-sim/Kconfig b/arch/arc/plat-sim/Kconfig
deleted file mode 100644
index ac6af96a82f3..000000000000
--- a/arch/arc/plat-sim/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Copyright (C) 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-#
-
-menuconfig ARC_PLAT_SIM
- bool "ARC nSIM based simulation virtual platforms"
- help
- Support for nSIM based ARC simulation platforms
- This includes the standalone nSIM (uart only) vs. System C OSCI VP
diff --git a/arch/arc/plat-sim/platform.c b/arch/arc/plat-sim/platform.c
index aea87389e44b..5cda56b1a2ea 100644
--- a/arch/arc/plat-sim/platform.c
+++ b/arch/arc/plat-sim/platform.c
@@ -20,11 +20,14 @@
*/
static const char *simulation_compat[] __initconst = {
+#ifdef CONFIG_ISA_ARCOMPACT
"snps,nsim",
- "snps,nsim_hs",
"snps,nsimosci",
+#else
+ "snps,nsim_hs",
"snps,nsimosci_hs",
"snps,zebu_hs",
+#endif
NULL,
};
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a208bfe367b5..61a0cb15067e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -380,7 +380,7 @@ config ARCH_EP93XX
bool "EP93xx-based"
select ARCH_HAS_HOLES_MEMORYMODEL
select ARM_AMBA
- select ARM_PATCH_PHYS_VIRT
+ imply ARM_PATCH_PHYS_VIRT
select ARM_VIC
select AUTO_ZRELADDR
select CLKDEV_LOOKUP
diff --git a/arch/arm/boot/dts/armada-388-gp.dts b/arch/arm/boot/dts/armada-388-gp.dts
index 895fa6cfa15a..563901e0ec07 100644
--- a/arch/arm/boot/dts/armada-388-gp.dts
+++ b/arch/arm/boot/dts/armada-388-gp.dts
@@ -75,7 +75,7 @@
pinctrl-names = "default";
pinctrl-0 = <&pca0_pins>;
interrupt-parent = <&gpio0>;
- interrupts = <18 IRQ_TYPE_EDGE_FALLING>;
+ interrupts = <18 IRQ_TYPE_LEVEL_LOW>;
gpio-controller;
#gpio-cells = <2>;
interrupt-controller;
@@ -87,7 +87,7 @@
compatible = "nxp,pca9555";
pinctrl-names = "default";
interrupt-parent = <&gpio0>;
- interrupts = <18 IRQ_TYPE_EDGE_FALLING>;
+ interrupts = <18 IRQ_TYPE_LEVEL_LOW>;
gpio-controller;
#gpio-cells = <2>;
interrupt-controller;
diff --git a/arch/arm/boot/dts/da850-evm.dts b/arch/arm/boot/dts/da850-evm.dts
index a423e8ebfb37..67e72bc72e80 100644
--- a/arch/arm/boot/dts/da850-evm.dts
+++ b/arch/arm/boot/dts/da850-evm.dts
@@ -301,25 +301,4 @@
pinctrl-names = "default";
pinctrl-0 = <&vpif_capture_pins>, <&vpif_display_pins>;
status = "okay";
-
- /* VPIF capture port */
- port@0 {
- vpif_input_ch0: endpoint@0 {
- reg = <0>;
- bus-width = <8>;
- };
-
- vpif_input_ch1: endpoint@1 {
- reg = <1>;
- bus-width = <8>;
- data-shift = <8>;
- };
- };
-
- /* VPIF display port */
- port@1 {
- vpif_output_ch0: endpoint {
- bus-width = <8>;
- };
- };
};
diff --git a/arch/arm/boot/dts/da850-lcdk.dts b/arch/arm/boot/dts/da850-lcdk.dts
index b837fec70eec..a0f0916156e6 100644
--- a/arch/arm/boot/dts/da850-lcdk.dts
+++ b/arch/arm/boot/dts/da850-lcdk.dts
@@ -318,11 +318,4 @@
pinctrl-names = "default";
pinctrl-0 = <&vpif_capture_pins>;
status = "okay";
-
- /* VPIF capture port */
- port {
- vpif_ch0: endpoint {
- bus-width = <8>;
- };
- };
};
diff --git a/arch/arm/boot/dts/dm8168-evm.dts b/arch/arm/boot/dts/dm8168-evm.dts
index 1865976db5f9..c72a2132aa82 100644
--- a/arch/arm/boot/dts/dm8168-evm.dts
+++ b/arch/arm/boot/dts/dm8168-evm.dts
@@ -68,6 +68,34 @@
DM816X_IOPAD(0x0d08, MUX_MODE0) /* USB1_DRVVBUS */
>;
};
+
+ nandflash_pins: nandflash_pins {
+ pinctrl-single,pins = <
+ DM816X_IOPAD(0x0b38, PULL_UP | MUX_MODE0) /* PINCTRL207 GPMC_CS0*/
+ DM816X_IOPAD(0x0b60, PULL_ENA | MUX_MODE0) /* PINCTRL217 GPMC_ADV_ALE */
+ DM816X_IOPAD(0x0b54, PULL_UP | PULL_ENA | MUX_MODE0) /* PINCTRL214 GPMC_OE_RE */
+ DM816X_IOPAD(0x0b58, PULL_ENA | MUX_MODE0) /* PINCTRL215 GPMC_BE0_CLE */
+ DM816X_IOPAD(0x0b50, PULL_UP | MUX_MODE0) /* PINCTRL213 GPMC_WE */
+ DM816X_IOPAD(0x0b6c, MUX_MODE0) /* PINCTRL220 GPMC_WAIT */
+ DM816X_IOPAD(0x0be4, PULL_ENA | MUX_MODE0) /* PINCTRL250 GPMC_CLK */
+ DM816X_IOPAD(0x0ba4, MUX_MODE0) /* PINCTRL234 GPMC_D0 */
+ DM816X_IOPAD(0x0ba8, MUX_MODE0) /* PINCTRL234 GPMC_D1 */
+ DM816X_IOPAD(0x0bac, MUX_MODE0) /* PINCTRL234 GPMC_D2 */
+ DM816X_IOPAD(0x0bb0, MUX_MODE0) /* PINCTRL234 GPMC_D3 */
+ DM816X_IOPAD(0x0bb4, MUX_MODE0) /* PINCTRL234 GPMC_D4 */
+ DM816X_IOPAD(0x0bb8, MUX_MODE0) /* PINCTRL234 GPMC_D5 */
+ DM816X_IOPAD(0x0bbc, MUX_MODE0) /* PINCTRL234 GPMC_D6 */
+ DM816X_IOPAD(0x0bc0, MUX_MODE0) /* PINCTRL234 GPMC_D7 */
+ DM816X_IOPAD(0x0bc4, MUX_MODE0) /* PINCTRL234 GPMC_D8 */
+ DM816X_IOPAD(0x0bc8, MUX_MODE0) /* PINCTRL234 GPMC_D9 */
+ DM816X_IOPAD(0x0bcc, MUX_MODE0) /* PINCTRL234 GPMC_D10 */
+ DM816X_IOPAD(0x0bd0, MUX_MODE0) /* PINCTRL234 GPMC_D11 */
+ DM816X_IOPAD(0x0bd4, MUX_MODE0) /* PINCTRL234 GPMC_D12 */
+ DM816X_IOPAD(0x0bd8, MUX_MODE0) /* PINCTRL234 GPMC_D13 */
+ DM816X_IOPAD(0x0bdc, MUX_MODE0) /* PINCTRL234 GPMC_D14 */
+ DM816X_IOPAD(0x0be0, MUX_MODE0) /* PINCTRL234 GPMC_D15 */
+ >;
+ };
};
&i2c1 {
@@ -90,6 +118,8 @@
&gpmc {
ranges = <0 0 0x04000000 0x01000000>; /* CS0: 16MB for NAND */
+ pinctrl-names = "default";
+ pinctrl-0 = <&nandflash_pins>;
nand@0,0 {
compatible = "ti,omap2-nand";
@@ -98,9 +128,11 @@
interrupt-parent = <&gpmc>;
interrupts = <0 IRQ_TYPE_NONE>, /* fifoevent */
<1 IRQ_TYPE_NONE>; /* termcount */
+ rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */
#address-cells = <1>;
#size-cells = <1>;
ti,nand-ecc-opt = "bch8";
+ ti,elm-id = <&elm>;
nand-bus-width = <16>;
gpmc,device-width = <2>;
gpmc,sync-clk-ps = <0>;
@@ -164,7 +196,7 @@
vmmc-supply = <&vmmcsd_fixed>;
bus-width = <4>;
cd-gpios = <&gpio2 7 GPIO_ACTIVE_LOW>;
- wp-gpios = <&gpio2 8 GPIO_ACTIVE_LOW>;
+ wp-gpios = <&gpio2 8 GPIO_ACTIVE_HIGH>;
};
/* At least dm8168-evm rev c won't support multipoint, later may */
diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi
index 59cbf958fcc3..566b2a8c8b96 100644
--- a/arch/arm/boot/dts/dm816x.dtsi
+++ b/arch/arm/boot/dts/dm816x.dtsi
@@ -145,7 +145,7 @@
};
elm: elm@48080000 {
- compatible = "ti,816-elm";
+ compatible = "ti,am3352-elm";
ti,hwmods = "elm";
reg = <0x48080000 0x2000>;
interrupts = <4>;
diff --git a/arch/arm/boot/dts/dra71-evm.dts b/arch/arm/boot/dts/dra71-evm.dts
index 4d57a55473af..a6298eb56978 100644
--- a/arch/arm/boot/dts/dra71-evm.dts
+++ b/arch/arm/boot/dts/dra71-evm.dts
@@ -190,7 +190,7 @@
ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
- ti,impedance-control = <0x1f>;
+ ti,min-output-impedance;
};
dp83867_1: ethernet-phy@3 {
@@ -198,7 +198,7 @@
ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
- ti,impedance-control = <0x1f>;
+ ti,min-output-impedance;
};
};
diff --git a/arch/arm/boot/dts/exynos4.dtsi b/arch/arm/boot/dts/exynos4.dtsi
index 497a9470c888..5739389f5bb8 100644
--- a/arch/arm/boot/dts/exynos4.dtsi
+++ b/arch/arm/boot/dts/exynos4.dtsi
@@ -59,6 +59,9 @@
compatible = "samsung,exynos4210-audss-clock";
reg = <0x03810000 0x0C>;
#clock-cells = <1>;
+ clocks = <&clock CLK_FIN_PLL>, <&clock CLK_FOUT_EPLL>,
+ <&clock CLK_SCLK_AUDIO0>, <&clock CLK_SCLK_AUDIO0>;
+ clock-names = "pll_ref", "pll_in", "sclk_audio", "sclk_pcm_in";
};
i2s0: i2s@03830000 {
diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
index f92f95741207..a183b56283f8 100644
--- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
+++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
@@ -266,6 +266,7 @@
&hdmicec {
status = "okay";
+ needs-hpd;
};
&hsi2c_4 {
diff --git a/arch/arm/boot/dts/imx25.dtsi b/arch/arm/boot/dts/imx25.dtsi
index dfcc8e00cf1c..0ade3619f3c3 100644
--- a/arch/arm/boot/dts/imx25.dtsi
+++ b/arch/arm/boot/dts/imx25.dtsi
@@ -297,6 +297,7 @@
#address-cells = <1>;
#size-cells = <1>;
status = "disabled";
+ ranges;
adc: adc@50030800 {
compatible = "fsl,imx25-gcq";
diff --git a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
index aeaa5a6e4fcf..a24e4f1911ab 100644
--- a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
+++ b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
@@ -507,7 +507,7 @@
pinctrl_pcie: pciegrp {
fsl,pins = <
/* PCIe reset */
- MX6QDL_PAD_EIM_BCLK__GPIO6_IO31 0x030b0
+ MX6QDL_PAD_EIM_DA0__GPIO3_IO00 0x030b0
MX6QDL_PAD_EIM_DA4__GPIO3_IO04 0x030b0
>;
};
@@ -668,7 +668,7 @@
&pcie {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_pcie>;
- reset-gpio = <&gpio6 31 GPIO_ACTIVE_LOW>;
+ reset-gpio = <&gpio3 0 GPIO_ACTIVE_LOW>;
status = "okay";
};
diff --git a/arch/arm/boot/dts/imx7d-sdb.dts b/arch/arm/boot/dts/imx7d-sdb.dts
index 54c45402286b..0a24d1bf3c39 100644
--- a/arch/arm/boot/dts/imx7d-sdb.dts
+++ b/arch/arm/boot/dts/imx7d-sdb.dts
@@ -557,6 +557,14 @@
>;
};
+ pinctrl_spi4: spi4grp {
+ fsl,pins = <
+ MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59
+ MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59
+ MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59
+ >;
+ };
+
pinctrl_tsc2046_pendown: tsc2046_pendown {
fsl,pins = <
MX7D_PAD_EPDC_BDR1__GPIO2_IO29 0x59
@@ -697,13 +705,5 @@
fsl,pins = <
MX7D_PAD_LPSR_GPIO1_IO01__PWM1_OUT 0x110b0
>;
-
- pinctrl_spi4: spi4grp {
- fsl,pins = <
- MX7D_PAD_GPIO1_IO09__GPIO1_IO9 0x59
- MX7D_PAD_GPIO1_IO12__GPIO1_IO12 0x59
- MX7D_PAD_GPIO1_IO13__GPIO1_IO13 0x59
- >;
- };
};
};
diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi
index 7bb9df2c1460..9319e1f0f1d8 100644
--- a/arch/arm/boot/dts/ls1021a.dtsi
+++ b/arch/arm/boot/dts/ls1021a.dtsi
@@ -129,14 +129,14 @@
};
msi1: msi-controller@1570e00 {
- compatible = "fsl,1s1021a-msi";
+ compatible = "fsl,ls1021a-msi";
reg = <0x0 0x1570e00 0x0 0x8>;
msi-controller;
interrupts = <GIC_SPI 179 IRQ_TYPE_LEVEL_HIGH>;
};
msi2: msi-controller@1570e08 {
- compatible = "fsl,1s1021a-msi";
+ compatible = "fsl,ls1021a-msi";
reg = <0x0 0x1570e08 0x0 0x8>;
msi-controller;
interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_HIGH>;
@@ -699,7 +699,7 @@
bus-range = <0x0 0xff>;
ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */
0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
- msi-parent = <&msi1>;
+ msi-parent = <&msi1>, <&msi2>;
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 7>;
interrupt-map = <0000 0 0 1 &gic GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>,
@@ -722,7 +722,7 @@
bus-range = <0x0 0xff>;
ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */
0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
- msi-parent = <&msi2>;
+ msi-parent = <&msi1>, <&msi2>;
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 7>;
interrupt-map = <0000 0 0 1 &gic GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 2484f11761ea..858e1fed762a 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -1126,8 +1126,8 @@
};
};
- gpu: mali@ffa30000 {
- compatible = "rockchip,rk3288-mali", "arm,mali-t760", "arm,mali-midgard";
+ gpu: gpu@ffa30000 {
+ compatible = "rockchip,rk3288-mali", "arm,mali-t760";
reg = <0xffa30000 0x10000>;
interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi
index cc06da394366..60e69aeacbdb 100644
--- a/arch/arm/boot/dts/sama5d2.dtsi
+++ b/arch/arm/boot/dts/sama5d2.dtsi
@@ -303,7 +303,7 @@
#size-cells = <1>;
atmel,smc = <&hsmc>;
reg = <0x10000000 0x10000000
- 0x40000000 0x30000000>;
+ 0x60000000 0x30000000>;
ranges = <0x0 0x0 0x10000000 0x10000000
0x1 0x0 0x60000000 0x10000000
0x2 0x0 0x70000000 0x10000000
@@ -1048,18 +1048,18 @@
};
hsmc: hsmc@f8014000 {
- compatible = "atmel,sama5d3-smc", "syscon", "simple-mfd";
+ compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd";
reg = <0xf8014000 0x1000>;
- interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>;
+ interrupts = <17 IRQ_TYPE_LEVEL_HIGH 6>;
clocks = <&hsmc_clk>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
- pmecc: ecc-engine@ffffc070 {
+ pmecc: ecc-engine@f8014070 {
compatible = "atmel,sama5d2-pmecc";
- reg = <0xffffc070 0x490>,
- <0xffffc500 0x100>;
+ reg = <0xf8014070 0x490>,
+ <0xf8014500 0x100>;
};
};
diff --git a/arch/arm/boot/dts/sun8i-a83t.dtsi b/arch/arm/boot/dts/sun8i-a83t.dtsi
index 8923ba625b76..19a8f4fcfab5 100644
--- a/arch/arm/boot/dts/sun8i-a83t.dtsi
+++ b/arch/arm/boot/dts/sun8i-a83t.dtsi
@@ -44,7 +44,9 @@
#include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/clock/sun8i-a83t-ccu.h>
#include <dt-bindings/clock/sun8i-r-ccu.h>
+#include <dt-bindings/reset/sun8i-a83t-ccu.h>
/ {
interrupt-parent = <&gic>;
@@ -175,8 +177,8 @@
compatible = "allwinner,sun8i-a83t-dma";
reg = <0x01c02000 0x1000>;
interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&ccu 21>;
- resets = <&ccu 7>;
+ clocks = <&ccu CLK_BUS_DMA>;
+ resets = <&ccu RST_BUS_DMA>;
#dma-cells = <1>;
};
@@ -195,7 +197,7 @@
<GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>;
reg = <0x01c20800 0x400>;
- clocks = <&ccu 45>, <&osc24M>, <&osc16Md512>;
+ clocks = <&ccu CLK_BUS_PIO>, <&osc24M>, <&osc16Md512>;
clock-names = "apb", "hosc", "losc";
gpio-controller;
interrupt-controller;
@@ -247,8 +249,8 @@
"allwinner,sun8i-h3-spdif";
reg = <0x01c21000 0x400>;
interrupts = <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&ccu 44>, <&ccu 76>;
- resets = <&ccu 32>;
+ clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>;
+ resets = <&ccu RST_BUS_SPDIF>;
clock-names = "apb", "spdif";
dmas = <&dma 2>;
dma-names = "tx";
@@ -263,8 +265,8 @@
interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>;
reg-shift = <2>;
reg-io-width = <4>;
- clocks = <&ccu 53>;
- resets = <&ccu 40>;
+ clocks = <&ccu CLK_BUS_UART0>;
+ resets = <&ccu RST_BUS_UART0>;
status = "disabled";
};
diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
index 6713d0f2b3f4..b1502df7b509 100644
--- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
+++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
@@ -56,8 +56,6 @@
aliases {
serial0 = &uart0;
- /* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */
- ethernet0 = &emac;
ethernet1 = &xr819;
};
@@ -104,13 +102,6 @@
status = "okay";
};
-&emac {
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- status = "okay";
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
index d756ff825116..a337af1de322 100644
--- a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
@@ -52,7 +52,6 @@
compatible = "sinovoip,bpi-m2-plus", "allwinner,sun8i-h3";
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
serial1 = &uart1;
};
@@ -115,30 +114,12 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
-
- allwinner,leds-active-low;
- status = "okay";
-};
-
&ir {
pinctrl-names = "default";
pinctrl-0 = <&ir_pins_a>;
status = "okay";
};
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <0>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
index 78f6c24952dd..8d2cc6e9a03f 100644
--- a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
+++ b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
@@ -46,10 +46,3 @@
model = "FriendlyARM NanoPi NEO";
compatible = "friendlyarm,nanopi-neo", "allwinner,sun8i-h3";
};
-
-&emac {
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- status = "okay";
-};
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
index 17cdeae19c6f..8ff71b1bb45b 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
@@ -54,7 +54,6 @@
aliases {
serial0 = &uart0;
/* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */
- ethernet0 = &emac;
ethernet1 = &rtl8189;
};
@@ -118,13 +117,6 @@
status = "okay";
};
-&emac {
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- status = "okay";
-};
-
&ir {
pinctrl-names = "default";
pinctrl-0 = <&ir_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
index 6880268e8b87..5fea430e0eb1 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
@@ -52,7 +52,6 @@
compatible = "xunlong,orangepi-one", "allwinner,sun8i-h3";
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
};
@@ -98,13 +97,6 @@
status = "okay";
};
-&emac {
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- status = "okay";
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
index a10281b455f5..8b93f5c781a7 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
@@ -53,11 +53,6 @@
};
};
-&emac {
- /* LEDs changed to active high on the plus */
- /delete-property/ allwinner,leds-active-low;
-};
-
&mmc1 {
pinctrl-names = "default";
pinctrl-0 = <&mmc1_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
index 998b60f8d295..1a044b17d6c6 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
@@ -52,7 +52,6 @@
compatible = "xunlong,orangepi-pc", "allwinner,sun8i-h3";
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
};
@@ -114,13 +113,6 @@
status = "okay";
};
-&emac {
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- status = "okay";
-};
-
&ir {
pinctrl-names = "default";
pinctrl-0 = <&ir_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
index 331ed683ac62..828ae7a526d9 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
@@ -47,10 +47,6 @@
model = "Xunlong Orange Pi Plus / Plus 2";
compatible = "xunlong,orangepi-plus", "allwinner,sun8i-h3";
- aliases {
- ethernet0 = &emac;
- };
-
reg_gmac_3v3: gmac-3v3 {
compatible = "regulator-fixed";
regulator-name = "gmac-3v3";
@@ -78,24 +74,6 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
-
- allwinner,leds-active-low;
- status = "okay";
-};
-
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <0>;
- };
-};
-
&mmc2 {
pinctrl-names = "default";
pinctrl-0 = <&mmc2_8bit_pins>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
index 80026f3caafc..97920b12a944 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
@@ -61,19 +61,3 @@
gpio = <&pio 3 6 GPIO_ACTIVE_HIGH>; /* PD6 */
};
};
-
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
- status = "okay";
-};
-
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
index 6f2162608006..11240a8313c2 100644
--- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi
+++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
@@ -391,32 +391,6 @@
clocks = <&osc24M>;
};
- emac: ethernet@1c30000 {
- compatible = "allwinner,sun8i-h3-emac";
- syscon = <&syscon>;
- reg = <0x01c30000 0x104>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
- #address-cells = <1>;
- #size-cells = <0>;
- status = "disabled";
-
- mdio: mdio {
- #address-cells = <1>;
- #size-cells = <0>;
- int_mii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- clocks = <&ccu CLK_BUS_EPHY>;
- resets = <&ccu RST_BUS_EPHY>;
- };
- };
- };
-
spi0: spi@01c68000 {
compatible = "allwinner,sun8i-h3-spi";
reg = <0x01c68000 0x1000>;
diff --git a/arch/arm/boot/dts/tango4-vantage-1172.dts b/arch/arm/boot/dts/tango4-vantage-1172.dts
index 86d8df98802f..13bcc460bcb2 100644
--- a/arch/arm/boot/dts/tango4-vantage-1172.dts
+++ b/arch/arm/boot/dts/tango4-vantage-1172.dts
@@ -22,7 +22,7 @@
};
&eth0 {
- phy-connection-type = "rgmii";
+ phy-connection-type = "rgmii-id";
phy-handle = <&eth0_phy>;
#address-cells = <1>;
#size-cells = <0>;
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h
index 27475904e096..eee269321923 100644
--- a/arch/arm/include/asm/arch_gicv3.h
+++ b/arch/arm/include/asm/arch_gicv3.h
@@ -276,6 +276,12 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr)
#define gicr_write_pendbaser(v, c) __gic_writeq_nonatomic(v, c)
/*
+ * GICR_xLPIR - only the lower bits are significant
+ */
+#define gic_read_lpir(c) readl_relaxed(c)
+#define gic_write_lpir(v, c) writel_relaxed(lower_32_bits(v), c)
+
+/*
* GITS_TYPER is an ID register and doesn't need atomicity.
*/
#define gits_read_typer(c) __gic_readq_nonatomic(c)
@@ -291,5 +297,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr)
*/
#define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c)
+/*
+ * GITS_VPROPBASER - hi and lo bits may be accessed independently.
+ */
+#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c)
+
+/*
+ * GITS_VPENDBASER - the Valid bit must be cleared before changing
+ * anything else.
+ */
+static inline void gits_write_vpendbaser(u64 val, void * __iomem addr)
+{
+ u32 tmp;
+
+ tmp = readl_relaxed(addr + 4);
+ if (tmp & (GICR_VPENDBASER_Valid >> 32)) {
+ tmp &= ~(GICR_VPENDBASER_Valid >> 32);
+ writel_relaxed(tmp, addr + 4);
+ }
+
+ /*
+ * Use the fact that __gic_writeq_nonatomic writes the second
+ * half of the 64bit quantity after the first.
+ */
+ __gic_writeq_nonatomic(val, addr);
+}
+
+#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c)
+
#endif /* !__ASSEMBLY__ */
#endif /* !__ASM_ARCH_GICV3_H */
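The new gits_write_vpendbaser() above only works because __gic_writeq_nonatomic() stores the low word before the high word: the Valid bit lives in the upper half, so it is cleared up front and only written back by the final high-word store. A minimal sketch of that lo-then-hi helper, assuming the usual relaxed MMIO accessors (the helper name here is illustrative, not taken from this patch):

/*
 * Illustrative sketch: a non-atomic 64-bit MMIO write that issues the
 * low 32 bits first and the high 32 bits (where GICR_VPENDBASER_Valid
 * sits) last, which is the ordering gits_write_vpendbaser() relies on.
 */
static inline void example_writeq_nonatomic(u64 val, volatile void __iomem *addr)
{
	writel_relaxed(lower_32_bits(val), addr);	/* low half first */
	writel_relaxed(upper_32_bits(val), addr + 4);	/* Valid bit last */
}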
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 6795368ad023..cc414382dab4 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
#endif /* !SMP */
static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret, tmp;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
#ifndef CONFIG_SMP
preempt_disable();
#endif
@@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
preempt_enable();
#endif
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
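With the switch to arch_futex_atomic_op_inuser(), the arch helper now only performs the atomic user-space read-modify-write and hands back the old value; decoding the encoded op, checking access_ok(), and evaluating the comparison are left to the generic futex code. A hedged sketch of what that caller side might look like, reconstructed from the logic removed above (the function name is a placeholder, not the real generic entry point):

/*
 * Sketch of a generic caller for the new hook: unpack encoded_op, let
 * the architecture do the atomic update, then compare the returned old
 * value. Mirrors the decode/compare code deleted from this header.
 */
static int example_futex_atomic_op(int encoded_op, u32 __user *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (int)(encoded_op << 8) >> 20;	/* sign-extended */
	int cmparg = (int)(encoded_op << 20) >> 20;
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
		return -EFAULT;

	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval < cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval > cmparg;
	default: return -ENOSYS;
	}
}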
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 127e2dd2e21c..4a879f6ff13b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -225,12 +225,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-/* We do not have shadow page tables, hence the empty hooks */
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
-}
-
struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 4bec45442072..c030143c18c6 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -52,22 +52,6 @@ static inline void dsb_sev(void)
* memory.
*/
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- u16 owner = READ_ONCE(lock->tickets.owner);
-
- for (;;) {
- arch_spinlock_t tmp = READ_ONCE(*lock);
-
- if (tmp.tickets.owner == tmp.tickets.next ||
- tmp.tickets.owner != owner)
- break;
-
- wfe();
- }
- smp_acquire__after_ctrl_dep();
-}
-
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE 4 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE 5 /* syscall trace active */
+#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
#define TIF_NOHZ 12 /* in adaptive nohz mode */
#define TIF_USING_IWMMXT 17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_FSCHECK)
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 3f2eb76243e3..d5562f9ce600 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -148,7 +148,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
}
static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
tlb->fullmm = !(start | (end+1));
@@ -166,8 +167,14 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
}
static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
+ if (force) {
+ tlb->range_start = start;
+ tlb->range_end = end;
+ }
+
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index f555bb3664dc..683d9230984a 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -18,7 +18,6 @@ struct undef_hook {
void register_undef_hook(struct undef_hook *hook);
void unregister_undef_hook(struct undef_hook *hook);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
extern char __irqentry_text_start[];
@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 0bf2347495f1..87936dd5d151 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
}
#define segment_eq(a, b) ((a) == (b))
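set_fs() now flags the task with TIF_FSCHECK so the return-to-user path can confirm the address limit was restored to USER_DS; the signal.c hunk below wires this up by calling addr_limit_user_check() from do_work_pending(). The helper body is not part of this patch, but conceptually it amounts to something like the following sketch (helper name prefixed example_ to make clear it is an assumption):

/*
 * Sketch only: on user-mode return, if a set_fs() happened, verify the
 * segment limit is back to USER_DS and clear the flag again so the
 * check stays off the fast path.
 */
static inline void example_addr_limit_user_check(void)
{
	if (!test_thread_flag(TIF_FSCHECK))
		return;

	if (WARN(!segment_eq(get_fs(), USER_DS),
		 "Invalid address limit on user-mode return"))
		force_sig(SIGKILL, current);

	clear_thread_flag(TIF_FSCHECK);
}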
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -41,7 +41,9 @@ ret_fast_syscall:
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
bne fast_work_pending
/* perform architecture specific actions before user return */
@@ -67,12 +69,15 @@ ret_fast_syscall:
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #_TIF_SYSCALL_WORK
+ bne fast_work_pending
+ tst r1, #_TIF_WORK_MASK
beq no_work_pending
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)
/* Slower path - fall through to work_pending */
+fast_work_pending:
#endif
tst r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 5814298ef0b7..e2de50bf8742 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/tracehook.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/elf.h>
#include <asm/cacheflush.h>
@@ -613,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
index d735e5fc4772..195da38cb9a2 100644
--- a/arch/arm/mach-at91/Kconfig
+++ b/arch/arm/mach-at91/Kconfig
@@ -1,7 +1,7 @@
menuconfig ARCH_AT91
bool "Atmel SoCs"
depends on ARCH_MULTI_V4T || ARCH_MULTI_V5 || ARCH_MULTI_V7 || ARM_SINGLE_ARMV7M
- select ARM_CPU_SUSPEND if PM
+ select ARM_CPU_SUSPEND if PM && ARCH_MULTI_V7
select COMMON_CLK_AT91
select GPIOLIB
select PINCTRL
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 667fddac3856..5036f996e694 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -608,6 +608,9 @@ static void __init at91_pm_init(void (*pm_idle)(void))
void __init at91rm9200_pm_init(void)
{
+ if (!IS_ENABLED(CONFIG_SOC_AT91RM9200))
+ return;
+
at91_dt_ramc();
/*
@@ -620,18 +623,27 @@ void __init at91rm9200_pm_init(void)
void __init at91sam9_pm_init(void)
{
+ if (!IS_ENABLED(CONFIG_SOC_AT91SAM9))
+ return;
+
at91_dt_ramc();
at91_pm_init(at91sam9_idle);
}
void __init sama5_pm_init(void)
{
+ if (!IS_ENABLED(CONFIG_SOC_SAMA5))
+ return;
+
at91_dt_ramc();
at91_pm_init(NULL);
}
void __init sama5d2_pm_init(void)
{
+ if (!IS_ENABLED(CONFIG_SOC_SAMA5D2))
+ return;
+
at91_pm_backup_init();
sama5_pm_init();
}
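The early returns above lean on IS_ENABLED(), which expands to a compile-time constant 1 or 0, so each *_pm_init() body still compiles on every configuration but is discarded by the compiler when the corresponding SoC option is not selected. A tiny sketch of the pattern (CONFIG_FOO is a placeholder, not a real option):

/*
 * Sketch: IS_ENABLED(CONFIG_FOO) is 1 when FOO=y (or =m), 0 otherwise,
 * so the guarded body is type-checked everywhere but optimised away on
 * kernels that do not select it.
 */
void __init example_pm_init(void)
{
	if (!IS_ENABLED(CONFIG_FOO))
		return;

	/* SoC-specific PM setup would follow here */
}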
diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c
index b5625d009288..e568c8c6f69c 100644
--- a/arch/arm/mach-davinci/board-da850-evm.c
+++ b/arch/arm/mach-davinci/board-da850-evm.c
@@ -1166,7 +1166,7 @@ static struct tvp514x_platform_data tvp5146_pdata = {
#define TVP514X_STD_ALL (V4L2_STD_NTSC | V4L2_STD_PAL)
-static const struct vpif_input da850_ch0_inputs[] = {
+static struct vpif_input da850_ch0_inputs[] = {
{
.input = {
.index = 0,
@@ -1181,7 +1181,7 @@ static const struct vpif_input da850_ch0_inputs[] = {
},
};
-static const struct vpif_input da850_ch1_inputs[] = {
+static struct vpif_input da850_ch1_inputs[] = {
{
.input = {
.index = 0,
diff --git a/arch/arm/mach-davinci/clock.c b/arch/arm/mach-davinci/clock.c
index f5dce9b4e617..f77a4f766050 100644
--- a/arch/arm/mach-davinci/clock.c
+++ b/arch/arm/mach-davinci/clock.c
@@ -218,6 +218,15 @@ int clk_set_parent(struct clk *clk, struct clk *parent)
}
EXPORT_SYMBOL(clk_set_parent);
+struct clk *clk_get_parent(struct clk *clk)
+{
+ if (!clk)
+ return NULL;
+
+ return clk->parent;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
int clk_register(struct clk *clk)
{
if (clk == NULL || IS_ERR(clk))
diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c
index 39ef3b613912..beec5f16443a 100644
--- a/arch/arm/mach-ep93xx/clock.c
+++ b/arch/arm/mach-ep93xx/clock.c
@@ -475,6 +475,26 @@ int clk_set_rate(struct clk *clk, unsigned long rate)
}
EXPORT_SYMBOL(clk_set_rate);
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+ WARN_ON(clk);
+ return 0;
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+ WARN_ON(clk);
+ return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+ return clk->parent;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
static char fclk_divisors[] = { 1, 2, 4, 8, 16, 1, 1, 1 };
static char hclk_divisors[] = { 1, 2, 4, 5, 6, 8, 16, 32 };
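davinci and ep93xx carry their own legacy clk implementations, and generic drivers built into the same image may call clk_round_rate(), clk_set_parent() or clk_get_parent() even when the platform never needs them, so exported stubs are added to keep such code linking. A hedged consumer-side sketch showing the calls the stubs have to satisfy (the device lookup and target rate are placeholders):

/*
 * Sketch of a generic clk consumer; nothing here is platform specific,
 * it only exercises the API the stubs above provide.
 */
static int example_consumer_probe(struct device *dev)
{
	struct clk *clk = clk_get(dev, NULL);	/* placeholder lookup */
	long rounded;

	if (IS_ERR(clk))
		return PTR_ERR(clk);

	rounded = clk_round_rate(clk, 48000000);	/* assumed target rate */
	if (rounded > 0)
		clk_set_rate(clk, rounded);

	dev_info(dev, "parent clk: %p\n", clk_get_parent(clk));

	clk_put(clk);
	return 0;
}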
diff --git a/arch/arm/mach-hisi/Kconfig b/arch/arm/mach-hisi/Kconfig
index a3b091a4d344..65a048fa08ec 100644
--- a/arch/arm/mach-hisi/Kconfig
+++ b/arch/arm/mach-hisi/Kconfig
@@ -39,6 +39,7 @@ config ARCH_HIP04
select HAVE_ARM_ARCH_TIMER
select MCPM if SMP
select MCPM_QUAD_CLUSTER if SMP
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK
help
Support for Hisilicon HiP04 SoC family
diff --git a/arch/arm/mach-ixp4xx/include/mach/io.h b/arch/arm/mach-ixp4xx/include/mach/io.h
index 7a0c13bf4269..844e8ac593e2 100644
--- a/arch/arm/mach-ixp4xx/include/mach/io.h
+++ b/arch/arm/mach-ixp4xx/include/mach/io.h
@@ -95,8 +95,10 @@ static inline void __indirect_writeb(u8 value, volatile void __iomem *p)
}
static inline void __indirect_writesb(volatile void __iomem *bus_addr,
- const u8 *vaddr, int count)
+ const void *p, int count)
{
+ const u8 *vaddr = p;
+
while (count--)
writeb(*vaddr++, bus_addr);
}
@@ -118,8 +120,10 @@ static inline void __indirect_writew(u16 value, volatile void __iomem *p)
}
static inline void __indirect_writesw(volatile void __iomem *bus_addr,
- const u16 *vaddr, int count)
+ const void *p, int count)
{
+ const u16 *vaddr = p;
+
while (count--)
writew(*vaddr++, bus_addr);
}
@@ -137,8 +141,9 @@ static inline void __indirect_writel(u32 value, volatile void __iomem *p)
}
static inline void __indirect_writesl(volatile void __iomem *bus_addr,
- const u32 *vaddr, int count)
+ const void *p, int count)
{
+ const u32 *vaddr = p;
while (count--)
writel(*vaddr++, bus_addr);
}
@@ -160,8 +165,10 @@ static inline u8 __indirect_readb(const volatile void __iomem *p)
}
static inline void __indirect_readsb(const volatile void __iomem *bus_addr,
- u8 *vaddr, u32 count)
+ void *p, u32 count)
{
+ u8 *vaddr = p;
+
while (count--)
*vaddr++ = readb(bus_addr);
}
@@ -183,8 +190,10 @@ static inline u16 __indirect_readw(const volatile void __iomem *p)
}
static inline void __indirect_readsw(const volatile void __iomem *bus_addr,
- u16 *vaddr, u32 count)
+ void *p, u32 count)
{
+ u16 *vaddr = p;
+
while (count--)
*vaddr++ = readw(bus_addr);
}
@@ -204,8 +213,10 @@ static inline u32 __indirect_readl(const volatile void __iomem *p)
}
static inline void __indirect_readsl(const volatile void __iomem *bus_addr,
- u32 *vaddr, u32 count)
+ void *p, u32 count)
{
+ u32 *vaddr = p;
+
while (count--)
*vaddr++ = readl(bus_addr);
}
@@ -523,8 +534,15 @@ static inline void iowrite32_rep(void __iomem *addr, const void *vaddr,
#endif
}
-#define ioport_map(port, nr) ((void __iomem*)(port + PIO_OFFSET))
-#define ioport_unmap(addr)
+#define ioport_map(port, nr) ioport_map(port, nr)
+static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
+{
+ return ((void __iomem*)((port) + PIO_OFFSET));
+}
+#define ioport_unmap(addr) ioport_unmap(addr)
+static inline void ioport_unmap(void __iomem *addr)
+{
+}
#endif /* CONFIG_PCI */
#endif /* __ASM_ARM_ARCH_IO_H */
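Switching the __indirect_reads*/__indirect_writes* helpers to const void * / void * buffers, and turning ioport_map()/ioport_unmap() into self-named static inlines, lines the ixp4xx overrides up with buffer-agnostic string-accessor prototypes, so common code can pass untyped buffers without pointer-type warnings. A small sketch of the kind of caller these signatures now accept (names and sizes are made up):

/*
 * Sketch: a generic caller hands an untyped buffer to the string
 * accessor; the override casts it internally, as the helpers above do.
 */
static void example_drain_fifo(const volatile void __iomem *regs, void *buf, int words)
{
	__indirect_readsl(regs, buf, words);	/* buf backing type is up to the caller */
}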
diff --git a/arch/arm/mach-mmp/devices.c b/arch/arm/mach-mmp/devices.c
index 3330ac7cfbef..671c7a09ab3d 100644
--- a/arch/arm/mach-mmp/devices.c
+++ b/arch/arm/mach-mmp/devices.c
@@ -238,7 +238,7 @@ void pxa_usb_phy_deinit(void __iomem *phy_reg)
#endif
#if IS_ENABLED(CONFIG_USB_SUPPORT)
-static u64 usb_dma_mask = ~(u32)0;
+static u64 __maybe_unused usb_dma_mask = ~(u32)0;
#if IS_ENABLED(CONFIG_USB_MV_UDC)
struct resource pxa168_u2o_resources[] = {
diff --git a/arch/arm/mach-mvebu/platsmp.c b/arch/arm/mach-mvebu/platsmp.c
index e62273aacb43..4ffbbd217e82 100644
--- a/arch/arm/mach-mvebu/platsmp.c
+++ b/arch/arm/mach-mvebu/platsmp.c
@@ -211,7 +211,7 @@ static int mv98dx3236_resume_set_cpu_boot_addr(int hw_cpu, void *boot_addr)
return PTR_ERR(base);
writel(0, base + MV98DX3236_CPU_RESUME_CTRL_REG);
- writel(virt_to_phys(boot_addr), base + MV98DX3236_CPU_RESUME_ADDR_REG);
+ writel(__pa_symbol(boot_addr), base + MV98DX3236_CPU_RESUME_ADDR_REG);
iounmap(base);
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index 6613a6ff5dbc..6cbc69c92913 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -510,6 +510,7 @@ static void __init ams_delta_init(void)
static void modem_pm(struct uart_port *port, unsigned int state, unsigned old)
{
struct modem_private_data *priv = port->private_data;
+ int ret;
if (IS_ERR(priv->regulator))
return;
@@ -518,9 +519,16 @@ static void modem_pm(struct uart_port *port, unsigned int state, unsigned old)
return;
if (state == 0)
- regulator_enable(priv->regulator);
+ ret = regulator_enable(priv->regulator);
else if (old == 0)
- regulator_disable(priv->regulator);
+ ret = regulator_disable(priv->regulator);
+ else
+ ret = 0;
+
+ if (ret)
+ dev_warn(port->dev,
+ "ams_delta modem_pm: failed to %sable regulator: %d\n",
+ state ? "dis" : "en", ret);
}
static struct plat_serial8250_port ams_delta_modem_ports[] = {
diff --git a/arch/arm/mach-omap1/board-osk.c b/arch/arm/mach-omap1/board-osk.c
index 4dfb99504810..95ac1929aede 100644
--- a/arch/arm/mach-omap1/board-osk.c
+++ b/arch/arm/mach-omap1/board-osk.c
@@ -441,13 +441,11 @@ static struct spi_board_info __initdata mistral_boardinfo[] = { {
.chip_select = 0,
} };
-#ifdef CONFIG_PM
static irqreturn_t
osk_mistral_wake_interrupt(int irq, void *ignored)
{
return IRQ_HANDLED;
}
-#endif
static void __init osk_mistral_init(void)
{
@@ -515,7 +513,6 @@ static void __init osk_mistral_init(void)
gpio_direction_input(OMAP_MPUIO(2));
irq_set_irq_type(irq, IRQ_TYPE_EDGE_RISING);
-#ifdef CONFIG_PM
/* share the IRQ in case someone wants to use the
* button for more than wakeup from system sleep.
*/
@@ -529,7 +526,6 @@ static void __init osk_mistral_init(void)
ret);
} else
enable_irq_wake(irq);
-#endif
} else
printk(KERN_ERR "OSK+Mistral: wakeup button is awol\n");
diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 779fb1f680b3..b3b3b3a19183 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -8,7 +8,7 @@ ccflags-y := -I$(srctree)/$(src)/include \
# Common support
obj-y := id.o io.o control.o devices.o fb.o timer.o pm.o \
common.o dma.o wd_timer.o display.o i2c.o hdq1w.o omap_hwmod.o \
- omap_device.o omap-headsmp.o sram.o drm.o
+ omap_device.o omap-headsmp.o sram.o
hwmod-common = omap_hwmod.o omap_hwmod_reset.o \
omap_hwmod_common_data.o
diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c
index dc9e34e670a2..583fc39d84cd 100644
--- a/arch/arm/mach-omap2/board-generic.c
+++ b/arch/arm/mach-omap2/board-generic.c
@@ -28,11 +28,12 @@ static const struct of_device_id omap_dt_match_table[] __initconst = {
{ }
};
-static void __init omap_generic_init(void)
+static void __init __maybe_unused omap_generic_init(void)
{
pdata_quirks_init(omap_dt_match_table);
omapdss_init_of();
+ omap_soc_device_init();
}
#ifdef CONFIG_SOC_OMAP2420
diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c
index 8fa01c0ecdb2..b3f6eb5d04a2 100644
--- a/arch/arm/mach-omap2/display.c
+++ b/arch/arm/mach-omap2/display.c
@@ -66,6 +66,7 @@
*/
#define FRAMEDONE_IRQ_TIMEOUT 100
+#if defined(CONFIG_FB_OMAP2)
static struct platform_device omap_display_device = {
.name = "omapdss",
.id = -1,
@@ -163,6 +164,65 @@ static enum omapdss_version __init omap_display_get_version(void)
return OMAPDSS_VER_UNKNOWN;
}
+static int __init omapdss_init_fbdev(void)
+{
+ static struct omap_dss_board_info board_data = {
+ .dsi_enable_pads = omap_dsi_enable_pads,
+ .dsi_disable_pads = omap_dsi_disable_pads,
+ .set_min_bus_tput = omap_dss_set_min_bus_tput,
+ };
+ struct device_node *node;
+ int r;
+
+ board_data.version = omap_display_get_version();
+ if (board_data.version == OMAPDSS_VER_UNKNOWN) {
+ pr_err("DSS not supported on this SoC\n");
+ return -ENODEV;
+ }
+
+ omap_display_device.dev.platform_data = &board_data;
+
+ r = platform_device_register(&omap_display_device);
+ if (r < 0) {
+ pr_err("Unable to register omapdss device\n");
+ return r;
+ }
+
+ /* create vrfb device */
+ r = omap_init_vrfb();
+ if (r < 0) {
+ pr_err("Unable to register omapvrfb device\n");
+ return r;
+ }
+
+ /* create FB device */
+ r = omap_init_fb();
+ if (r < 0) {
+ pr_err("Unable to register omapfb device\n");
+ return r;
+ }
+
+ /* create V4L2 display device */
+ r = omap_init_vout();
+ if (r < 0) {
+ pr_err("Unable to register omap_vout device\n");
+ return r;
+ }
+
+ /* add DSI info for omap4 */
+ node = of_find_node_by_name(NULL, "omap4_padconf_global");
+ if (node)
+ omap4_dsi_mux_syscon = syscon_node_to_regmap(node);
+
+ return 0;
+}
+#else
+static inline int omapdss_init_fbdev(void)
+{
+ return 0;
+}
+#endif /* CONFIG_FB_OMAP2 */
+
static void dispc_disable_outputs(void)
{
u32 v, irq_mask = 0;
@@ -335,16 +395,9 @@ static struct device_node * __init omapdss_find_dss_of_node(void)
int __init omapdss_init_of(void)
{
int r;
- enum omapdss_version ver;
struct device_node *node;
struct platform_device *pdev;
- static struct omap_dss_board_info board_data = {
- .dsi_enable_pads = omap_dsi_enable_pads,
- .dsi_disable_pads = omap_dsi_disable_pads,
- .set_min_bus_tput = omap_dss_set_min_bus_tput,
- };
-
/* only create dss helper devices if dss is enabled in the .dts */
node = omapdss_find_dss_of_node();
@@ -354,13 +407,6 @@ int __init omapdss_init_of(void)
if (!of_device_is_available(node))
return 0;
- ver = omap_display_get_version();
-
- if (ver == OMAPDSS_VER_UNKNOWN) {
- pr_err("DSS not supported on this SoC\n");
- return -ENODEV;
- }
-
pdev = of_find_device_by_node(node);
if (!pdev) {
@@ -374,48 +420,5 @@ int __init omapdss_init_of(void)
return r;
}
- board_data.version = ver;
-
- omap_display_device.dev.platform_data = &board_data;
-
- r = platform_device_register(&omap_display_device);
- if (r < 0) {
- pr_err("Unable to register omapdss device\n");
- return r;
- }
-
- /* create DRM device */
- r = omap_init_drm();
- if (r < 0) {
- pr_err("Unable to register omapdrm device\n");
- return r;
- }
-
- /* create vrfb device */
- r = omap_init_vrfb();
- if (r < 0) {
- pr_err("Unable to register omapvrfb device\n");
- return r;
- }
-
- /* create FB device */
- r = omap_init_fb();
- if (r < 0) {
- pr_err("Unable to register omapfb device\n");
- return r;
- }
-
- /* create V4L2 display device */
- r = omap_init_vout();
- if (r < 0) {
- pr_err("Unable to register omap_vout device\n");
- return r;
- }
-
- /* add DSI info for omap4 */
- node = of_find_node_by_name(NULL, "omap4_padconf_global");
- if (node)
- omap4_dsi_mux_syscon = syscon_node_to_regmap(node);
-
- return 0;
+ return omapdss_init_fbdev();
}
diff --git a/arch/arm/mach-omap2/display.h b/arch/arm/mach-omap2/display.h
index 9a39646d4316..42ec2e99a2f4 100644
--- a/arch/arm/mach-omap2/display.h
+++ b/arch/arm/mach-omap2/display.h
@@ -26,7 +26,6 @@ struct omap_dss_dispc_dev_attr {
bool has_framedonetv_irq;
};
-int omap_init_drm(void);
int omap_init_vrfb(void);
int omap_init_fb(void);
int omap_init_vout(void);
diff --git a/arch/arm/mach-omap2/drm.c b/arch/arm/mach-omap2/drm.c
deleted file mode 100644
index 44fef961bb70..000000000000
--- a/arch/arm/mach-omap2/drm.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * DRM/KMS device registration for TI OMAP platforms
- *
- * Copyright (C) 2012 Texas Instruments
- * Author: Rob Clark <rob.clark@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/dma-mapping.h>
-#include <linux/platform_data/omap_drm.h>
-
-#include "soc.h"
-#include "display.h"
-
-#if IS_ENABLED(CONFIG_DRM_OMAP)
-
-static struct omap_drm_platform_data platform_data;
-
-static struct platform_device omap_drm_device = {
- .dev = {
- .coherent_dma_mask = DMA_BIT_MASK(32),
- .platform_data = &platform_data,
- },
- .name = "omapdrm",
- .id = 0,
-};
-
-int __init omap_init_drm(void)
-{
- platform_data.omaprev = GET_OMAP_TYPE;
-
- return platform_device_register(&omap_drm_device);
-
-}
-#else
-int __init omap_init_drm(void) { return 0; }
-#endif
diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c
index 1d739d1a0a65..cb5d7314cf99 100644
--- a/arch/arm/mach-omap2/io.c
+++ b/arch/arm/mach-omap2/io.c
@@ -410,7 +410,7 @@ static int _set_hwmod_postsetup_state(struct omap_hwmod *oh, void *data)
return omap_hwmod_set_postsetup_state(oh, *(u8 *)data);
}
-static void __init omap_hwmod_init_postsetup(void)
+static void __init __maybe_unused omap_hwmod_init_postsetup(void)
{
u8 postsetup_state;
@@ -428,7 +428,6 @@ static void __init omap_hwmod_init_postsetup(void)
static void __init __maybe_unused omap_common_late_init(void)
{
omap2_common_pm_late_init();
- omap_soc_device_init();
}
#ifdef CONFIG_SOC_OMAP2420
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index d44e0e2f1106..841ba19d64a6 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -486,7 +486,6 @@ int __init omap3_pm_init(void)
ret = request_irq(omap_prcm_event_to_irq("io"),
_prcm_int_handle_io, IRQF_SHARED | IRQF_NO_SUSPEND, "pm_io",
omap3_pm_init);
- enable_irq(omap_prcm_event_to_irq("io"));
if (ret) {
pr_err("pm: Failed to request pm_io irq\n");
diff --git a/arch/arm/mach-omap2/prm3xxx.c b/arch/arm/mach-omap2/prm3xxx.c
index 382e236fbfd9..64f6451499a7 100644
--- a/arch/arm/mach-omap2/prm3xxx.c
+++ b/arch/arm/mach-omap2/prm3xxx.c
@@ -692,7 +692,6 @@ static int omap3xxx_prm_late_init(void)
{
struct device_node *np;
int irq_num;
- int ret;
if (!(prm_features & PRM_HAS_IO_WAKEUP))
return 0;
@@ -712,12 +711,8 @@ static int omap3xxx_prm_late_init(void)
}
omap3xxx_prm_enable_io_wakeup();
- ret = omap_prcm_register_chain_handler(&omap3_prcm_irq_setup);
- if (!ret)
- irq_set_status_flags(omap_prcm_event_to_irq("io"),
- IRQ_NOAUTOEN);
- return ret;
+ return omap_prcm_register_chain_handler(&omap3_prcm_irq_setup);
}
static void __exit omap3xxx_prm_exit(void)
diff --git a/arch/arm/mach-omap2/prm44xx.c b/arch/arm/mach-omap2/prm44xx.c
index 87e86a4a9ead..3ab5df1ce900 100644
--- a/arch/arm/mach-omap2/prm44xx.c
+++ b/arch/arm/mach-omap2/prm44xx.c
@@ -337,6 +337,27 @@ static void omap44xx_prm_reconfigure_io_chain(void)
}
/**
+ * omap44xx_prm_enable_io_wakeup - enable wakeup events from I/O wakeup latches
+ *
+ * Activates the I/O wakeup event latches and allows events logged by
+ * those latches to signal a wakeup event to the PRCM. For I/O wakeups
+ * to occur, WAKEUPENABLE bits must be set in the pad mux registers, and
+ * omap44xx_prm_reconfigure_io_chain() must be called. No return value.
+ */
+static void __init omap44xx_prm_enable_io_wakeup(void)
+{
+ s32 inst = omap4_prmst_get_prm_dev_inst();
+
+ if (inst == PRM_INSTANCE_UNKNOWN)
+ return;
+
+ omap4_prm_rmw_inst_reg_bits(OMAP4430_GLOBAL_WUEN_MASK,
+ OMAP4430_GLOBAL_WUEN_MASK,
+ inst,
+ omap4_prcm_irq_setup.pm_ctrl);
+}
+
+/**
* omap44xx_prm_read_reset_sources - return the last SoC reset source
*
* Return a u32 representing the last reset sources of the SoC. The
@@ -668,6 +689,8 @@ struct pwrdm_ops omap4_pwrdm_operations = {
.pwrdm_has_voltdm = omap4_check_vcvp,
};
+static int omap44xx_prm_late_init(void);
+
/*
* XXX document
*/
@@ -675,6 +698,7 @@ static struct prm_ll_data omap44xx_prm_ll_data = {
.read_reset_sources = &omap44xx_prm_read_reset_sources,
.was_any_context_lost_old = &omap44xx_prm_was_any_context_lost_old,
.clear_context_loss_flags_old = &omap44xx_prm_clear_context_loss_flags_old,
+ .late_init = &omap44xx_prm_late_init,
.assert_hardreset = omap4_prminst_assert_hardreset,
.deassert_hardreset = omap4_prminst_deassert_hardreset,
.is_hardreset_asserted = omap4_prminst_is_hardreset_asserted,
@@ -711,6 +735,37 @@ int __init omap44xx_prm_init(const struct omap_prcm_init_data *data)
return prm_register(&omap44xx_prm_ll_data);
}
+static int omap44xx_prm_late_init(void)
+{
+ int irq_num;
+
+ if (!(prm_features & PRM_HAS_IO_WAKEUP))
+ return 0;
+
+ irq_num = of_irq_get(prm_init_data->np, 0);
+ /*
+ * Already have OMAP4 IRQ num. For all other platforms, we need
+ * IRQ numbers from DT
+ */
+ if (irq_num < 0 && !(prm_init_data->flags & PRM_IRQ_DEFAULT)) {
+ if (irq_num == -EPROBE_DEFER)
+ return irq_num;
+
+ /* Have nothing to do */
+ return 0;
+ }
+
+ /* Once OMAP4 DT is filled as well */
+ if (irq_num >= 0) {
+ omap4_prcm_irq_setup.irq = irq_num;
+ omap4_prcm_irq_setup.xlate_irq = NULL;
+ }
+
+ omap44xx_prm_enable_io_wakeup();
+
+ return omap_prcm_register_chain_handler(&omap4_prcm_irq_setup);
+}
+
static void __exit omap44xx_prm_exit(void)
{
prm_unregister(&omap44xx_prm_ll_data);
diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c
index 8cadb302a7d2..ffe05c27087e 100644
--- a/arch/arm/mach-prima2/common.c
+++ b/arch/arm/mach-prima2/common.c
@@ -15,7 +15,7 @@
#include <linux/of_platform.h>
#include "common.h"
-static void __init sirfsoc_init_late(void)
+static void __init __maybe_unused sirfsoc_init_late(void)
{
sirfsoc_pm_init();
}
diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig
index 76fbc115ec33..ce7d97babb0f 100644
--- a/arch/arm/mach-pxa/Kconfig
+++ b/arch/arm/mach-pxa/Kconfig
@@ -566,6 +566,7 @@ config MACH_ICONTROL
config ARCH_PXA_ESERIES
bool "PXA based Toshiba e-series PDAs"
select FB_W100
+ select FB
select PXA25x
config MACH_E330
diff --git a/arch/arm/mach-pxa/include/mach/mtd-xip.h b/arch/arm/mach-pxa/include/mach/mtd-xip.h
index 990d2bf2fb45..9bf4ea6a6f74 100644
--- a/arch/arm/mach-pxa/include/mach/mtd-xip.h
+++ b/arch/arm/mach-pxa/include/mach/mtd-xip.h
@@ -17,11 +17,15 @@
#include <mach/regs-ost.h>
-#define xip_irqpending() (ICIP & ICMR)
+/* restored July 2017, this did not build since 2011! */
+
+#define ICIP io_p2v(0x40d00000)
+#define ICMR io_p2v(0x40d00004)
+#define xip_irqpending() (readl(ICIP) & readl(ICMR))
/* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
-#define xip_currtime() (OSCR)
-#define xip_elapsed_since(x) (signed)((OSCR - (x)) / 4)
+#define xip_currtime() readl(OSCR)
+#define xip_elapsed_since(x) (signed)((readl(OSCR) - (x)) / 4)
/*
* xip_cpu_idle() is used when waiting for a delay equal or larger than
diff --git a/arch/arm/mach-rpc/include/mach/hardware.h b/arch/arm/mach-rpc/include/mach/hardware.h
index aa79fa47373a..622d4e5df029 100644
--- a/arch/arm/mach-rpc/include/mach/hardware.h
+++ b/arch/arm/mach-rpc/include/mach/hardware.h
@@ -25,8 +25,8 @@
* *_SIZE is the size of the region
* *_BASE is the virtual address
*/
-#define RAM_SIZE 0x10000000
-#define RAM_START 0x10000000
+#define RPC_RAM_SIZE 0x10000000
+#define RPC_RAM_START 0x10000000
#define EASI_SIZE 0x08000000 /* EASI I/O */
#define EASI_START 0x08000000
diff --git a/arch/arm/mach-sa1100/clock.c b/arch/arm/mach-sa1100/clock.c
index 0db46895c82a..7d52cd97d96e 100644
--- a/arch/arm/mach-sa1100/clock.c
+++ b/arch/arm/mach-sa1100/clock.c
@@ -35,6 +35,31 @@ struct clk clk_##_name = { \
static DEFINE_SPINLOCK(clocks_lock);
+/* Dummy clk routine to build generic kernel parts that may be using them */
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+ return clk_get_rate(clk);
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_rate(struct clk *clk, unsigned long rate)
+{
+ return 0;
+}
+EXPORT_SYMBOL(clk_set_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+ return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+ return NULL;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
static void clk_gpio27_enable(struct clk *clk)
{
/*
diff --git a/arch/arm/mach-sa1100/include/mach/mtd-xip.h b/arch/arm/mach-sa1100/include/mach/mtd-xip.h
index b3d684098fbf..cb76096a2e36 100644
--- a/arch/arm/mach-sa1100/include/mach/mtd-xip.h
+++ b/arch/arm/mach-sa1100/include/mach/mtd-xip.h
@@ -20,7 +20,7 @@
#define xip_irqpending() (ICIP & ICMR)
/* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
-#define xip_currtime() (OSCR)
-#define xip_elapsed_since(x) (signed)((OSCR - (x)) / 4)
+#define xip_currtime() readl_relaxed(OSCR)
+#define xip_elapsed_since(x) (signed)((readl_relaxed(OSCR) - (x)) / 4)
#endif /* __ARCH_SA1100_MTD_XIP_H__ */
diff --git a/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c b/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
index 73e3adbc1330..44438f344dc8 100644
--- a/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
+++ b/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
@@ -67,8 +67,12 @@ static int regulator_quirk_notify(struct notifier_block *nb,
{
struct device *dev = data;
struct i2c_client *client;
+ static bool done;
u32 mon;
+ if (done)
+ return 0;
+
mon = ioread32(irqc + IRQC_MONITOR);
dev_dbg(dev, "%s: %ld, IRQC_MONITOR = 0x%x\n", __func__, action, mon);
if (mon & REGULATOR_IRQ_MASK)
@@ -99,7 +103,7 @@ static int regulator_quirk_notify(struct notifier_block *nb,
remove:
dev_info(dev, "IRQ2 is not asserted, removing quirk\n");
- bus_unregister_notifier(&i2c_bus_type, nb);
+ done = true;
iounmap(irqc);
return 0;
}
diff --git a/arch/arm/mach-w90x900/clock.c b/arch/arm/mach-w90x900/clock.c
index ac6fd1a2cb59..3f93fac98d97 100644
--- a/arch/arm/mach-w90x900/clock.c
+++ b/arch/arm/mach-w90x900/clock.c
@@ -93,3 +93,32 @@ void nuc900_subclk_enable(struct clk *clk, int enable)
__raw_writel(clken, W90X900_VA_CLKPWR + SUBCLK);
}
+
+/* dummy functions, should not be called */
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+ WARN_ON(clk);
+ return 0;
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_rate(struct clk *clk, unsigned long rate)
+{
+ WARN_ON(clk);
+ return 0;
+}
+EXPORT_SYMBOL(clk_set_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+ WARN_ON(clk);
+ return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+ WARN_ON(clk);
+ return NULL;
+}
+EXPORT_SYMBOL(clk_get_parent);
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
index 0d1f026d831a..6872135d7f84 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
@@ -67,14 +67,6 @@
};
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&rgmii_pins>;
- phy-mode = "rgmii";
- phy-handle = <&ext_rgmii_phy>;
- status = "okay";
-};
-
&i2c1 {
pinctrl-names = "default";
pinctrl-0 = <&i2c1_pins>;
@@ -85,13 +77,6 @@
bias-pull-up;
};
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
index 24f1aac366d6..f82ccf332c0f 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
@@ -48,18 +48,3 @@
/* TODO: Camera, touchscreen, etc. */
};
-
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&rgmii_pins>;
- phy-mode = "rgmii";
- phy-handle = <&ext_rgmii_phy>;
- status = "okay";
-};
-
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
index 08cda24ea194..7c533b6d4ba9 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
@@ -78,15 +78,6 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&rmii_pins>;
- phy-mode = "rmii";
- phy-handle = <&ext_rmii_phy1>;
- status = "okay";
-
-};
-
&i2c1 {
pinctrl-names = "default";
pinctrl-0 = <&i2c1_pins>;
@@ -97,13 +88,6 @@
bias-pull-up;
};
-&mdio {
- ext_rmii_phy1: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index 17eb1cc5bf6b..d891a1a27f6c 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -76,21 +76,6 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&rgmii_pins>;
- phy-mode = "rgmii";
- phy-handle = <&ext_rgmii_phy>;
- status = "okay";
-};
-
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
-
&mmc2 {
pinctrl-names = "default";
pinctrl-0 = <&mmc2_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
index 9d00622ce845..68aadc9b96dc 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
@@ -449,26 +449,6 @@
#size-cells = <0>;
};
- emac: ethernet@1c30000 {
- compatible = "allwinner,sun50i-a64-emac";
- syscon = <&syscon>;
- reg = <0x01c30000 0x100>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
- status = "disabled";
- #address-cells = <1>;
- #size-cells = <0>;
-
- mdio: mdio {
- #address-cells = <1>;
- #size-cells = <0>;
- };
- };
-
gic: interrupt-controller@1c81000 {
compatible = "arm,gic-400";
reg = <0x01c81000 0x1000>,
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
index 968908761194..1c2387bd5df6 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
@@ -50,7 +50,6 @@
compatible = "friendlyarm,nanopi-neo2", "allwinner,sun50i-h5";
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
};
@@ -109,22 +108,6 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
- status = "okay";
-};
-
-&mdio {
- ext_rgmii_phy: ethernet-phy@7 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <7>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
index a8296feee884..4f77c8470f6c 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
@@ -59,7 +59,6 @@
};
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
};
@@ -137,28 +136,12 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
- status = "okay";
-};
-
&ir {
pinctrl-names = "default";
pinctrl-0 = <&ir_pins_a>;
status = "okay";
};
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
index d906b302cbcd..6be06873e5af 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
@@ -54,7 +54,6 @@
compatible = "xunlong,orangepi-prime", "allwinner,sun50i-h5";
aliases {
- ethernet0 = &emac;
serial0 = &uart0;
};
@@ -144,28 +143,12 @@
status = "okay";
};
-&emac {
- pinctrl-names = "default";
- pinctrl-0 = <&emac_rgmii_pins>;
- phy-supply = <&reg_gmac_3v3>;
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
- status = "okay";
-};
-
&ir {
pinctrl-names = "default";
pinctrl-0 = <&ir_pins_a>;
status = "okay";
};
-&mdio {
- ext_rgmii_phy: ethernet-phy@1 {
- compatible = "ethernet-phy-ieee802.3-c22";
- reg = <1>;
- };
-};
-
&mmc0 {
pinctrl-names = "default";
pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
index 732e2e06f503..d9a720bff05d 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
@@ -120,5 +120,8 @@
};
&pio {
+ interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>;
compatible = "allwinner,sun50i-h5-pinctrl";
};
diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
index 35b8c88c3220..738ed689ff69 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
@@ -400,7 +400,7 @@
};
pwm_AO_ab: pwm@550 {
- compatible = "amlogic,meson-gx-pwm", "amlogic,meson-gxbb-pwm";
+ compatible = "amlogic,meson-gx-ao-pwm", "amlogic,meson-gxbb-ao-pwm";
reg = <0x0 0x00550 0x0 0x10>;
#pwm-cells = <3>;
status = "disabled";
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
index 72c5a9f64ca8..94567eb17875 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
@@ -109,8 +109,8 @@
status = "okay";
pinctrl-0 = <&pwm_ao_a_3_pins>, <&pwm_ao_b_pins>;
pinctrl-names = "default";
- clocks = <&clkc CLKID_FCLK_DIV4>;
- clock-names = "clkin0";
+ clocks = <&xtal>, <&xtal>;
+ clock-names = "clkin0", "clkin1";
};
&pwm_ef {
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
index 890821d6e52b..266fbcf3e47f 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
@@ -10,12 +10,20 @@
#include <dt-bindings/input/input.h>
-#include "meson-gxl-s905x-p212.dtsi"
+#include "meson-gxl-s905x.dtsi"
/ {
compatible = "libretech,cc", "amlogic,s905x", "amlogic,meson-gxl";
model = "Libre Technology CC";
+ aliases {
+ serial0 = &uart_AO;
+ };
+
+ chosen {
+ stdout-path = "serial0:115200n8";
+ };
+
cvbs-connector {
compatible = "composite-video-connector";
@@ -26,6 +34,11 @@
};
};
+ emmc_pwrseq: emmc-pwrseq {
+ compatible = "mmc-pwrseq-emmc";
+ reset-gpios = <&gpio BOOT_9 GPIO_ACTIVE_LOW>;
+ };
+
hdmi-connector {
compatible = "hdmi-connector";
type = "a";
@@ -53,6 +66,39 @@
linux,default-trigger = "heartbeat";
};
};
+
+ memory@0 {
+ device_type = "memory";
+ reg = <0x0 0x0 0x0 0x80000000>;
+ };
+
+ vcc_3v3: regulator-vcc_3v3 {
+ compatible = "regulator-fixed";
+ regulator-name = "VCC_3V3";
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
+ };
+
+ vcc_card: regulator-vcc-card {
+ compatible = "regulator-gpio";
+
+ regulator-name = "VCC_CARD";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+
+ gpios = <&gpio_ao GPIOAO_3 GPIO_ACTIVE_HIGH>;
+ gpios-states = <0>;
+
+ states = <3300000 0>,
+ <1800000 1>;
+ };
+
+ vddio_boot: regulator-vddio_boot {
+ compatible = "regulator-fixed";
+ regulator-name = "VDDIO_BOOT";
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
+ };
};
&cvbs_vdac_port {
@@ -61,6 +107,16 @@
};
};
+&ethmac {
+ status = "okay";
+};
+
+&ir {
+ status = "okay";
+ pinctrl-0 = <&remote_input_ao_pins>;
+ pinctrl-names = "default";
+};
+
&hdmi_tx {
status = "okay";
pinctrl-0 = <&hdmi_hpd_pins>, <&hdmi_i2c_pins>;
@@ -73,20 +129,43 @@
};
};
-/*
- * The following devices exists but are exposed on the general
- * purpose GPIO header. End user may well decide to use those pins
- * for another purpose
- */
+/* SD card */
+&sd_emmc_b {
+ status = "okay";
+ pinctrl-0 = <&sdcard_pins>;
+ pinctrl-names = "default";
+
+ bus-width = <4>;
+ cap-sd-highspeed;
+ max-frequency = <100000000>;
+ disable-wp;
+
+ cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
+ cd-inverted;
-&sd_emmc_a {
- status = "disabled";
+ vmmc-supply = <&vcc_3v3>;
+ vqmmc-supply = <&vcc_card>;
};
-&uart_A {
- status = "disabled";
+/* eMMC */
+&sd_emmc_c {
+ status = "okay";
+ pinctrl-0 = <&emmc_pins>;
+ pinctrl-names = "default";
+
+ bus-width = <8>;
+ cap-mmc-highspeed;
+ max-frequency = <50000000>;
+ non-removable;
+ disable-wp;
+
+ mmc-pwrseq = <&emmc_pwrseq>;
+ vmmc-supply = <&vcc_3v3>;
+ vqmmc-supply = <&vddio_boot>;
};
-&wifi32k {
- status = "disabled";
+&uart_AO {
+ status = "okay";
+ pinctrl-0 = <&uart_ao_a_pins>;
+ pinctrl-names = "default";
};
diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
index e2b0da2c0bc7..105b2938082f 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
@@ -280,9 +280,6 @@
&decon {
status = "okay";
-
- i80-if-timings {
- };
};
&decon_tv {
@@ -1116,9 +1113,6 @@
&mic {
status = "okay";
-
- i80-if-timings {
- };
};
&pmu_system_controller {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index 31fd77f82ced..d16b9cc1e825 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -653,21 +653,21 @@
};
msi1: msi-controller1@1571000 {
- compatible = "fsl,1s1043a-msi";
+ compatible = "fsl,ls1043a-msi";
reg = <0x0 0x1571000 0x0 0x8>;
msi-controller;
interrupts = <0 116 0x4>;
};
msi2: msi-controller2@1572000 {
- compatible = "fsl,1s1043a-msi";
+ compatible = "fsl,ls1043a-msi";
reg = <0x0 0x1572000 0x0 0x8>;
msi-controller;
interrupts = <0 126 0x4>;
};
msi3: msi-controller3@1573000 {
- compatible = "fsl,1s1043a-msi";
+ compatible = "fsl,ls1043a-msi";
reg = <0x0 0x1573000 0x0 0x8>;
msi-controller;
interrupts = <0 160 0x4>;
@@ -689,7 +689,7 @@
bus-range = <0x0 0xff>;
ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */
0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
- msi-parent = <&msi1>;
+ msi-parent = <&msi1>, <&msi2>, <&msi3>;
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 7>;
interrupt-map = <0000 0 0 1 &gic 0 110 0x4>,
@@ -714,7 +714,7 @@
bus-range = <0x0 0xff>;
ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */
0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
- msi-parent = <&msi2>;
+ msi-parent = <&msi1>, <&msi2>, <&msi3>;
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 7>;
interrupt-map = <0000 0 0 1 &gic 0 120 0x4>,
@@ -739,7 +739,7 @@
bus-range = <0x0 0xff>;
ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */
0x82000000 0x0 0x40000000 0x50 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
- msi-parent = <&msi3>;
+ msi-parent = <&msi1>, <&msi2>, <&msi3>;
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 7>;
interrupt-map = <0000 0 0 1 &gic 0 154 0x4>,
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index dc1640be0345..c8ff0baddf1d 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -630,6 +630,37 @@
interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clockgen 4 1>;
};
+
+ msi1: msi-controller@1580000 {
+ compatible = "fsl,ls1046a-msi";
+ msi-controller;
+ reg = <0x0 0x1580000 0x0 0x10000>;
+ interrupts = <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ msi2: msi-controller@1590000 {
+ compatible = "fsl,ls1046a-msi";
+ msi-controller;
+ reg = <0x0 0x1590000 0x0 0x10000>;
+ interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ msi3: msi-controller@15a0000 {
+ compatible = "fsl,ls1046a-msi";
+ msi-controller;
+ reg = <0x0 0x15a0000 0x0 0x10000>;
+ interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
};
reserved-memory {
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index dbcc3d4e2ed5..51763d674050 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -219,7 +219,7 @@
reg = <0x18800 0x100>, <0x18C00 0x20>;
gpiosb: gpio {
#gpio-cells = <2>;
- gpio-ranges = <&pinctrl_sb 0 0 29>;
+ gpio-ranges = <&pinctrl_sb 0 0 30>;
gpio-controller;
interrupts =
<GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
index 1eb1f1e9aac4..4d360713ed12 100644
--- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
@@ -268,10 +268,10 @@
ap_gpio: gpio {
compatible = "marvell,armada-8k-gpio";
offset = <0x1040>;
- ngpios = <19>;
+ ngpios = <20>;
gpio-controller;
#gpio-cells = <2>;
- gpio-ranges = <&ap_pinctrl 0 0 19>;
+ gpio-ranges = <&ap_pinctrl 0 0 20>;
};
};
};
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
index 726528ce54e9..4c68605675a8 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
@@ -270,6 +270,7 @@
interrupt-names = "mem", "ring0", "ring1",
"ring2", "ring3", "eip";
clocks = <&cpm_clk 1 26>;
+ dma-coherent;
};
};
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
index 95f8e5f607f6..923f354b02f0 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
@@ -64,7 +64,7 @@
compatible = "marvell,armada-8k-rtc";
reg = <0x284000 0x20>, <0x284080 0x24>;
reg-names = "rtc", "rtc-soc";
- interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <ICU_GRP_NSR 77 IRQ_TYPE_LEVEL_HIGH>;
};
cps_ethernet: ethernet@0 {
@@ -261,6 +261,7 @@
interrupt-names = "mem", "ring0", "ring1",
"ring2", "ring3", "eip";
clocks = <&cps_clk 1 26>;
+ dma-coherent;
/*
* The cryptographic engine found on the cp110
* master is enabled by default at the SoC
diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
index aef35e0b685a..f903957da504 100644
--- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi
+++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
@@ -45,7 +45,7 @@
stdout-path = "serial0:115200n8";
};
- audio_clkout: audio_clkout {
+ audio_clkout: audio-clkout {
/*
* This is same as <&rcar_sound 0>
* but needed to avoid cs2000/rcar_sound probe dead-lock
@@ -508,7 +508,7 @@
/* audio_clkout0/1/2/3 */
#clock-cells = <1>;
- clock-frequency = <11289600 12288000>;
+ clock-frequency = <12288000 11289600>;
status = "okay";
diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi
index b5c6ee07d7f9..d1a3f3b7a0ab 100644
--- a/arch/arm64/boot/dts/renesas/ulcb.dtsi
+++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi
@@ -281,7 +281,7 @@
/* audio_clkout0/1/2/3 */
#clock-cells = <1>;
- clock-frequency = <11289600 12288000>;
+ clock-frequency = <12288000 11289600>;
status = "okay";
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 6c7d147eed54..b4ca115b3be1 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -476,6 +476,7 @@ CONFIG_QCOM_CLK_SMD_RPM=y
CONFIG_MSM_GCC_8916=y
CONFIG_MSM_GCC_8994=y
CONFIG_MSM_MMCC_8996=y
+CONFIG_HWSPINLOCK=y
CONFIG_HWSPINLOCK_QCOM=y
CONFIG_ARM_MHU=y
CONFIG_PLATFORM_MHU=y
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 8cef47fa2218..b7e3f74822da 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -116,6 +116,8 @@ static inline void gic_write_bpr1(u32 val)
#define gic_read_typer(c) readq_relaxed(c)
#define gic_write_irouter(v, c) writeq_relaxed(v, c)
+#define gic_read_lpir(c) readq_relaxed(c)
+#define gic_write_lpir(v, c) writeq_relaxed(v, c)
#define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
@@ -133,5 +135,10 @@ static inline void gic_write_bpr1(u32 val)
#define gicr_write_pendbaser(v, c) writeq_relaxed(v, c)
#define gicr_read_pendbaser(c) readq_relaxed(c)
+#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c)
+
+#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c)
+#define gits_read_vpendbaser(c) readq_relaxed(c)
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_ARCH_GICV3_H */
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index 74d08e44a651..a652ce0a5cb2 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -65,13 +65,13 @@ DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *,
u64 _val; \
if (needs_unstable_timer_counter_workaround()) { \
const struct arch_timer_erratum_workaround *wa; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
wa = __this_cpu_read(timer_unstable_counter_workaround); \
if (wa && wa->read_##reg) \
_val = wa->read_##reg(); \
else \
_val = read_sysreg(reg); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
} else { \
_val = read_sysreg(reg); \
} \
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index acae781f7359..3288c2b36731 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -114,10 +114,10 @@
/*
* This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
* space open for things that want to use the area for 32-bit pointers.
*/
-#define ELF_ET_DYN_BASE 0x100000000UL
+#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
#ifndef __ASSEMBLY__
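Note on the new base (a worked figure, assuming the common CONFIG_ARM64_VA_BITS=48 configuration, where TASK_SIZE_64 = 1 << 48):

    ELF_ET_DYN_BASE = 2 * TASK_SIZE_64 / 3 = 2 * 2^48 / 3 = 0x0000aaaaaaaaaaaa  (~170 TiB)

so PIE interpreters still load far above the 4 GB line mentioned in the comment, while the base now scales with the configured VA size instead of being pinned to a fixed constant.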
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index f32b42e8725d..5bb2fd4674e7 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -48,20 +48,10 @@ do { \
} while (0)
static inline int
-futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (int)(encoded_op << 8) >> 20;
- int cmparg = (int)(encoded_op << 20) >> 20;
int oldval = 0, ret, tmp;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1U << (oparg & 0x1f);
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
@@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
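For context, the decoding and comparison removed here move into common code. A minimal sketch of that caller, assuming the usual FUTEX_OP encoding (op in bits 31-28, cmp in bits 27-24, then two sign-extended 12-bit arguments) -- not the verbatim generic helper:

/*
 * Sketch only (constants from <linux/futex.h>); the real generic helper
 * may differ in detail.  The arch hook performs the atomic op and returns
 * the old value; the comparison half stays in common code.
 */
static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (int)(encoded_op << 8) >> 20;	/* sign-extend */
	int cmparg = (int)(encoded_op << 20) >> 20;	/* sign-extend */
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval <  cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval >  cmparg;
	default:              return -ENOSYS;
	}
}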
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d68630007b14..e923b58606e2 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -326,12 +326,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-/* We do not have shadow page tables, hence the empty hooks */
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
-}
-
struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 32f82723338a..ef39dcb9ca6a 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -64,8 +64,10 @@
* TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
*/
#define VA_BITS (CONFIG_ARM64_VA_BITS)
-#define VA_START (UL(0xffffffffffffffff) << VA_BITS)
-#define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1))
+#define VA_START (UL(0xffffffffffffffff) - \
+ (UL(1) << VA_BITS) + 1)
+#define PAGE_OFFSET (UL(0xffffffffffffffff) - \
+ (UL(1) << (VA_BITS - 1)) + 1)
#define KIMAGE_VADDR (MODULES_END)
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE)
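The rewritten constants are numerically identical to the old shift-based ones; for example, with VA_BITS = 48:

    VA_START    = 0xffffffffffffffff - 2^48 + 1 = 0xffff000000000000
    PAGE_OFFSET = 0xffffffffffffffff - 2^47 + 1 = 0xffff800000000000

Only the derivation changes: the new form avoids left-shifting an all-ones constant.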
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index cae331d553f8..95ad7102b63c 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -26,58 +26,6 @@
* The memory barriers are implicit with the load-acquire and store-release
* instructions.
*/
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- unsigned int tmp;
- arch_spinlock_t lockval;
- u32 owner;
-
- /*
- * Ensure prior spin_lock operations to other locks have completed
- * on this CPU before we test whether "lock" is locked.
- */
- smp_mb();
- owner = READ_ONCE(lock->owner) << 16;
-
- asm volatile(
-" sevl\n"
-"1: wfe\n"
-"2: ldaxr %w0, %2\n"
- /* Is the lock free? */
-" eor %w1, %w0, %w0, ror #16\n"
-" cbz %w1, 3f\n"
- /* Lock taken -- has there been a subsequent unlock->lock transition? */
-" eor %w1, %w3, %w0, lsl #16\n"
-" cbz %w1, 1b\n"
- /*
- * The owner has been updated, so there was an unlock->lock
- * transition that we missed. That means we can rely on the
- * store-release of the unlock operation paired with the
- * load-acquire of the lock operation to publish any of our
- * previous stores to the new lock owner and therefore don't
- * need to bother with the writeback below.
- */
-" b 4f\n"
-"3:\n"
- /*
- * Serialise against any concurrent lockers by writing back the
- * unlocked lock value
- */
- ARM64_LSE_ATOMIC_INSN(
- /* LL/SC */
-" stxr %w1, %w0, %2\n"
- __nops(2),
- /* LSE atomics */
-" mov %w1, %w0\n"
-" cas %w0, %w0, %2\n"
-" eor %w1, %w1, %w0\n")
- /* Somebody else wrote to the lock, GOTO 10 and reload the value */
-" cbnz %w1, 2b\n"
-"4:"
- : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
- : "r" (owner)
- : "memory");
-}
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
@@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- smp_mb(); /* See arch_spin_unlock_wait */
+ /*
+ * Ensure prior spin_lock operations to other locks have completed
+ * on this CPU before we test whether "lock" is locked.
+ */
+ smp_mb(); /* ^^^ */
return !arch_spin_value_unlocked(READ_ONCE(*lock));
}
@@ -358,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
-/*
- * Accesses appearing in program order before a spin_lock() operation
- * can be reordered with accesses inside the critical section, by virtue
- * of arch_spin_lock being constructed using acquire semantics.
- *
- * In cases where this is problematic (e.g. try_to_wake_up), an
- * smp_mb__before_spinlock() can restore the required ordering.
- */
-#define smp_mb__before_spinlock() smp_mb()
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock() smp_mb()
#endif /* __ASM_SPINLOCK_H */
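Call pattern for the replacement macro (illustrative only; the authoritative contract is documented in include/linux/spinlock.h, and lk below is a stand-in lock):

	raw_spin_lock(&lk);
	smp_mb__after_spinlock();	/* upgrade the acquire to a full barrier:
					 * accesses before the lock can no longer
					 * be reordered with the critical section */
	/* ... critical section ... */
	raw_spin_unlock(&lk);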
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..c5ba565544ee 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
+#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
#define TIF_NOHZ 7
#define TIF_SYSCALL_TRACE 8
#define TIF_SYSCALL_AUDIT 9
@@ -107,11 +108,12 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_32BIT (1 << TIF_32BIT)
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
- _TIF_UPROBE)
+ _TIF_UPROBE | _TIF_FSCHECK)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 02e9035b0685..47a9066f7c86 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook);
void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index fab46a0ea223..a801a48a7972 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+
/*
* Enable/disable UAO so that copy_to_user() etc can access
* kernel memory with the unprivileged instructions.
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 06da8ea16bbe..c7b4995868e1 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -161,9 +161,11 @@ void fpsimd_flush_thread(void)
{
if (!system_supports_fpsimd())
return;
+ preempt_disable();
memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
fpsimd_flush_task_state(current);
set_thread_flag(TIF_FOREIGN_FPSTATE);
+ preempt_enable();
}
/*
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 973df7de7bf8..adb0910b88f5 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -354,7 +354,6 @@ __primary_switched:
tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
b.ne 0f
mov x0, x21 // pass FDT address in x0
- mov x1, x23 // pass modulo offset in x1
bl kaslr_early_init // parse FDT for KASLR options
cbz x0, 0f // KASLR disabled? just proceed
orr x23, x23, x0 // record KASLR offset
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index a9710efb8c01..47080c49cc7e 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -75,7 +75,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size,
* containing function pointers) to be reinitialized, and zero-initialized
* .bss variables will be reset to 0.
*/
-u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
+u64 __init kaslr_early_init(u64 dt_phys)
{
void *fdt;
u64 seed, offset, mask, module_range;
@@ -131,15 +131,17 @@ u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
/*
* The kernel Image should not extend across a 1GB/32MB/512MB alignment
* boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
- * happens, increase the KASLR offset by the size of the kernel image
- * rounded up by SWAPPER_BLOCK_SIZE.
+ * happens, round down the KASLR offset to a multiple of (1 << SWAPPER_TABLE_SHIFT).

+ *
+ * NOTE: The references to _text and _end below will already take the
+ * modulo offset (the physical displacement modulo 2 MB) into
+ * account, given that the physical placement is controlled by
+ * the loader, and will not change as a result of the virtual
+ * mapping we choose.
*/
- if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) !=
- (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) {
- u64 kimg_sz = _end - _text;
- offset = (offset + round_up(kimg_sz, SWAPPER_BLOCK_SIZE))
- & mask;
- }
+ if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) !=
+ (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT))
+ offset = round_down(offset, 1 << SWAPPER_TABLE_SHIFT);
if (IS_ENABLED(CONFIG_KASAN))
/*
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 659ae8094ed5..c8f7d98d8cb9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
/*
* Complete any pending TLB or cache maintenance on this CPU in case
* the thread migrates to a different CPU.
+ * This full barrier is also required by the membarrier system
+ * call.
*/
dsb(ish);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 089c3747995d..e3e3293d1123 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -29,6 +29,7 @@
#include <linux/string.h>
#include <linux/tracehook.h>
#include <linux/ratelimit.h>
+#include <linux/syscalls.h>
#include <asm/debug-monitors.h>
#include <asm/elf.h>
@@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
do {
if (thread_flags & _TIF_NEED_RESCHED) {
schedule();
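The check itself lives in common code; roughly (a sketch, not the verbatim helper from <linux/syscalls.h>), it kills the task if it is about to return to userspace with a kernel address limit still installed:

/* Sketch only; the real helper may use different reporting machinery. */
static inline void addr_limit_user_check(void)
{
#ifdef TIF_FSCHECK
	if (!test_thread_flag(TIF_FSCHECK))
		return;

	/* set_fs() was called but the limit was never restored to USER_DS */
	if (WARN_ON(!segment_eq(get_fs(), USER_DS)))
		force_sig(SIGKILL, current);

	clear_thread_flag(TIF_FSCHECK);
#endif
}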
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index d48f47080213..8a62648848e5 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -523,7 +523,7 @@ static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
{
int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
- pt_regs_write_reg(regs, rt, read_sysreg(cntfrq_el0));
+ pt_regs_write_reg(regs, rt, arch_timer_get_rate());
regs->pc += 4;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 77862881ae86..2e070d3baf9f 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -764,7 +764,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write) {
if (r->CRm & 0x2)
/* accessing PMOVSSET_EL0 */
- kvm_pmu_overflow_set(vcpu, p->regval & mask);
+ vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
else
/* accessing PMOVSCLR_EL0 */
vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c7861c9864e6..1f22a41565a3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -163,26 +163,27 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
/* only preserve the access flags and write permission */
pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY;
- /*
- * PTE_RDONLY is cleared by default in the asm below, so set it in
- * back if necessary (read-only or clean PTE).
- */
+ /* set PTE_RDONLY if actual read-only or clean PTE */
if (!pte_write(entry) || !pte_sw_dirty(entry))
pte_val(entry) |= PTE_RDONLY;
/*
* Setting the flags must be done atomically to avoid racing with the
- * hardware update of the access/dirty state.
+ * hardware update of the access/dirty state. The PTE_RDONLY bit must
+ * be set to the most permissive (lowest value) of *ptep and entry
+ * (calculated as: a & b == ~(~a | ~b)).
*/
+ pte_val(entry) ^= PTE_RDONLY;
asm volatile("// ptep_set_access_flags\n"
" prfm pstl1strm, %2\n"
"1: ldxr %0, %2\n"
- " and %0, %0, %3 // clear PTE_RDONLY\n"
+ " eor %0, %0, %3 // negate PTE_RDONLY in *ptep\n"
" orr %0, %0, %4 // set flags\n"
+ " eor %0, %0, %3 // negate final PTE_RDONLY\n"
" stxr %w1, %0, %2\n"
" cbnz %w1, 1b\n"
: "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))
- : "L" (~PTE_RDONLY), "r" (pte_val(entry)));
+ : "L" (PTE_RDONLY), "r" (pte_val(entry)));
flush_tlb_fix_spurious_fault(vma, address);
return 1;
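The eor/orr/eor dance is easier to follow in C form. A sketch of the bit manipulation only (names local to this example), showing how OR-ing the inverted PTE_RDONLY bits and inverting back yields their AND, while every other flag is plainly OR-ed in:

/* Sketch: RDONLY ends up set only if it was set in both old and entry. */
static unsigned long merge_pte_flags(unsigned long old, unsigned long entry,
				     unsigned long rdonly)
{
	unsigned long new;

	entry ^= rdonly;	/* pre-invert entry's RDONLY (done in C above) */
	new  = old ^ rdonly;	/* invert RDONLY in the loaded PTE             */
	new |= entry;		/* OR in the flags; RDONLY is now ~a | ~b      */
	new ^= rdonly;		/* invert back: ~(~a | ~b) == a & b            */
	return new;
}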
@@ -434,8 +435,11 @@ retry:
* the mmap_sem because it would already be released
* in __lock_page_or_retry in mm/filemap.c.
*/
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current)) {
+ if (!user_mode(regs))
+ goto no_context;
return 0;
+ }
/*
* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h
index c58f4a83ed6f..f6431439d15d 100644
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
__raw_spin_unlock_asm(&lock->lock);
}
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
static inline int arch_read_can_lock(arch_rwlock_t *rw)
{
return __raw_uncached_fetch_asm(&rw->lock) > 0;
diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 0188c933b155..15af5768c403 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -4,8 +4,6 @@
* Licensed under the GPL-2 or later
*/
-#define pr_fmt(fmt) "module %s: " fmt, mod->name
-
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
@@ -16,6 +14,11 @@
#include <asm/cacheflush.h>
#include <linux/uaccess.h>
+#define mod_err(mod, fmt, ...) \
+ pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+#define mod_debug(mod, fmt, ...) \
+ pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+
/* Transfer the section to the L1 memory */
int
module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l1_inst_sram_alloc(s->sh_size);
mod->arch.text_l1 = dest;
if (dest == NULL) {
- pr_err("L1 inst memory allocation failed\n");
+ mod_err(mod, "L1 inst memory allocation failed\n");
return -1;
}
dma_memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l1_data_sram_alloc(s->sh_size);
mod->arch.data_a_l1 = dest;
if (dest == NULL) {
- pr_err("L1 data memory allocation failed\n");
+ mod_err(mod, "L1 data memory allocation failed\n");
return -1;
}
memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l1_data_sram_zalloc(s->sh_size);
mod->arch.bss_a_l1 = dest;
if (dest == NULL) {
- pr_err("L1 data memory allocation failed\n");
+ mod_err(mod, "L1 data memory allocation failed\n");
return -1;
}
@@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l1_data_B_sram_alloc(s->sh_size);
mod->arch.data_b_l1 = dest;
if (dest == NULL) {
- pr_err("L1 data memory allocation failed\n");
+ mod_err(mod, "L1 data memory allocation failed\n");
return -1;
}
memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l1_data_B_sram_alloc(s->sh_size);
mod->arch.bss_b_l1 = dest;
if (dest == NULL) {
- pr_err("L1 data memory allocation failed\n");
+ mod_err(mod, "L1 data memory allocation failed\n");
return -1;
}
memset(dest, 0, s->sh_size);
@@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l2_sram_alloc(s->sh_size);
mod->arch.text_l2 = dest;
if (dest == NULL) {
- pr_err("L2 SRAM allocation failed\n");
+ mod_err(mod, "L2 SRAM allocation failed\n");
return -1;
}
memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l2_sram_alloc(s->sh_size);
mod->arch.data_l2 = dest;
if (dest == NULL) {
- pr_err("L2 SRAM allocation failed\n");
+ mod_err(mod, "L2 SRAM allocation failed\n");
return -1;
}
memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
dest = l2_sram_zalloc(s->sh_size);
mod->arch.bss_l2 = dest;
if (dest == NULL) {
- pr_err("L2 SRAM allocation failed\n");
+ mod_err(mod, "L2 SRAM allocation failed\n");
return -1;
}
@@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
Elf32_Sym *sym;
unsigned long location, value, size;
- pr_debug("applying relocate section %u to %u\n",
- relsec, sechdrs[relsec].sh_info);
+ mod_debug(mod, "applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
@@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
#ifdef CONFIG_SMP
if (location >= COREB_L1_DATA_A_START) {
- pr_err("cannot relocate in L1: %u (SMP kernel)\n",
+ mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n",
ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
#endif
- pr_debug("location is %lx, value is %lx type is %d\n",
- location, value, ELF32_R_TYPE(rel[i].r_info));
+ mod_debug(mod, "location is %lx, value is %lx type is %d\n",
+ location, value, ELF32_R_TYPE(rel[i].r_info));
switch (ELF32_R_TYPE(rel[i].r_info)) {
@@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
case R_BFIN_PCREL12_JUMP:
case R_BFIN_PCREL12_JUMP_S:
case R_BFIN_PCREL10:
- pr_err("unsupported relocation: %u (no -mlong-calls?)\n",
+ mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n",
ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
default:
- pr_err("unknown relocation: %u\n",
+ mod_err(mod, "unknown relocation: %u\n",
ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
@@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
isram_memcpy((void *)location, &value, size);
break;
default:
- pr_err("invalid relocation for %#lx\n", location);
+ mod_err(mod, "invalid relocation for %#lx\n", location);
return -ENOEXEC;
}
}
diff --git a/arch/c6x/configs/dsk6455_defconfig b/arch/c6x/configs/dsk6455_defconfig
index 4663487c67a1..d764ea4cce7f 100644
--- a/arch/c6x/configs/dsk6455_defconfig
+++ b/arch/c6x/configs/dsk6455_defconfig
@@ -1,5 +1,4 @@
CONFIG_SOC_TMS320C6455=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_SPARSE_IRQ=y
@@ -25,7 +24,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=2
CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6457_defconfig b/arch/c6x/configs/evmc6457_defconfig
index bba40e195ec4..05d0b4a25ab1 100644
--- a/arch/c6x/configs/evmc6457_defconfig
+++ b/arch/c6x/configs/evmc6457_defconfig
@@ -1,5 +1,4 @@
CONFIG_SOC_TMS320C6457=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_SPARSE_IRQ=y
@@ -26,7 +25,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=2
CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6472_defconfig b/arch/c6x/configs/evmc6472_defconfig
index 8c46155f6d31..8d81fcf86b0e 100644
--- a/arch/c6x/configs/evmc6472_defconfig
+++ b/arch/c6x/configs/evmc6472_defconfig
@@ -1,5 +1,4 @@
CONFIG_SOC_TMS320C6472=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=2
CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6474_defconfig b/arch/c6x/configs/evmc6474_defconfig
index 15533f632313..8156a98f3958 100644
--- a/arch/c6x/configs/evmc6474_defconfig
+++ b/arch/c6x/configs/evmc6474_defconfig
@@ -1,5 +1,4 @@
CONFIG_SOC_TMS320C6474=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=2
CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6678_defconfig b/arch/c6x/configs/evmc6678_defconfig
index 5f126d4905b1..c4f433c25b69 100644
--- a/arch/c6x/configs/evmc6678_defconfig
+++ b/arch/c6x/configs/evmc6678_defconfig
@@ -1,5 +1,4 @@
CONFIG_SOC_TMS320C6678=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=2
CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
# CONFIG_INPUT is not set
# CONFIG_SERIO is not set
# CONFIG_VT is not set
diff --git a/arch/c6x/platforms/megamod-pic.c b/arch/c6x/platforms/megamod-pic.c
index 43afc03e4125..9519fa5f97d0 100644
--- a/arch/c6x/platforms/megamod-pic.c
+++ b/arch/c6x/platforms/megamod-pic.c
@@ -208,14 +208,14 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
pic = kzalloc(sizeof(struct megamod_pic), GFP_KERNEL);
if (!pic) {
- pr_err("%s: Could not alloc PIC structure.\n", np->full_name);
+ pr_err("%pOF: Could not alloc PIC structure.\n", np);
return NULL;
}
pic->irqhost = irq_domain_add_linear(np, NR_COMBINERS * 32,
&megamod_domain_ops, pic);
if (!pic->irqhost) {
- pr_err("%s: Could not alloc host.\n", np->full_name);
+ pr_err("%pOF: Could not alloc host.\n", np);
goto error_free;
}
@@ -225,7 +225,7 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
pic->regs = of_iomap(np, 0);
if (!pic->regs) {
- pr_err("%s: Could not map registers.\n", np->full_name);
+ pr_err("%pOF: Could not map registers.\n", np);
goto error_free;
}
@@ -253,8 +253,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
irq_data = irq_get_irq_data(irq);
if (!irq_data) {
- pr_err("%s: combiner-%d no irq_data for virq %d!\n",
- np->full_name, i, irq);
+ pr_err("%pOF: combiner-%d no irq_data for virq %d!\n",
+ np, i, irq);
continue;
}
@@ -265,16 +265,16 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
* of the core priority interrupts (4 - 15).
*/
if (hwirq < 4 || hwirq >= NR_PRIORITY_IRQS) {
- pr_err("%s: combiner-%d core irq %ld out of range!\n",
- np->full_name, i, hwirq);
+ pr_err("%pOF: combiner-%d core irq %ld out of range!\n",
+ np, i, hwirq);
continue;
}
/* record the mapping */
mapping[hwirq - 4] = i;
- pr_debug("%s: combiner-%d cascading to hwirq %ld\n",
- np->full_name, i, hwirq);
+ pr_debug("%pOF: combiner-%d cascading to hwirq %ld\n",
+ np, i, hwirq);
cascade_data[i].pic = pic;
cascade_data[i].index = i;
@@ -290,8 +290,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
/* Finally, set up the MUX registers */
for (i = 0; i < NR_MUX_OUTPUTS; i++) {
if (mapping[i] != IRQ_UNMAPPED) {
- pr_debug("%s: setting mux %d to priority %d\n",
- np->full_name, mapping[i], i + 4);
+ pr_debug("%pOF: setting mux %d to priority %d\n",
+ np, mapping[i], i + 4);
set_megamod_mux(pic, mapping[i], i);
}
}
diff --git a/arch/c6x/platforms/plldata.c b/arch/c6x/platforms/plldata.c
index 755359eb6286..e8b6cc6a7b5a 100644
--- a/arch/c6x/platforms/plldata.c
+++ b/arch/c6x/platforms/plldata.c
@@ -436,8 +436,8 @@ void __init c64x_setup_clocks(void)
err = of_property_read_u32(node, "clock-frequency", &val);
if (err || val == 0) {
- pr_err("%s: no clock-frequency found! Using %dMHz\n",
- node->full_name, (int)val / 1000000);
+ pr_err("%pOF: no clock-frequency found! Using %dMHz\n",
+ node, (int)val / 1000000);
val = 25000000;
}
clkin1.rate = val;
diff --git a/arch/c6x/platforms/timer64.c b/arch/c6x/platforms/timer64.c
index 0bd0452ded80..241a9a607193 100644
--- a/arch/c6x/platforms/timer64.c
+++ b/arch/c6x/platforms/timer64.c
@@ -204,14 +204,14 @@ void __init timer64_init(void)
timer = of_iomap(np, 0);
if (!timer) {
- pr_debug("%s: Cannot map timer registers.\n", np->full_name);
+ pr_debug("%pOF: Cannot map timer registers.\n", np);
goto out;
}
- pr_debug("%s: Timer registers=%p.\n", np->full_name, timer);
+ pr_debug("%pOF: Timer registers=%p.\n", np, timer);
cd->irq = irq_of_parse_and_map(np, 0);
if (cd->irq == NO_IRQ) {
- pr_debug("%s: Cannot find interrupt.\n", np->full_name);
+ pr_debug("%pOF: Cannot find interrupt.\n", np);
iounmap(timer);
goto out;
}
@@ -229,7 +229,7 @@ void __init timer64_init(void)
dscr_set_devstate(timer64_devstate_id, DSCR_DEVSTATE_ENABLED);
}
- pr_debug("%s: Timer irq=%d.\n", np->full_name, cd->irq);
+ pr_debug("%pOF: Timer irq=%d.\n", np, cd->irq);
clockevents_calc_mult_shift(cd, c6x_core_freq / TIMER_DIVISOR, 5);
diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c
index ab5c421a4de8..735a9b0abdb8 100644
--- a/arch/cris/arch-v32/mach-a3/arbiter.c
+++ b/arch/cris/arch-v32/mach-a3/arbiter.c
@@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots)
}
}
-extern char _stext, _etext;
+extern char _stext[], _etext[];
static void crisv32_arbiter_init(void)
{
@@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void)
#ifndef CONFIG_ETRAX_KGDB
/* Global watch for writes to kernel text segment. */
- crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+ crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients),
arbiter_all_write, NULL);
#endif
diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c
index c97f4d8120f9..047c70bdbb23 100644
--- a/arch/cris/arch-v32/mach-fs/arbiter.c
+++ b/arch/cris/arch-v32/mach-fs/arbiter.c
@@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots)
}
}
-extern char _stext, _etext;
+extern char _stext[], _etext[];
static void crisv32_arbiter_init(void)
{
@@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void)
#ifndef CONFIG_ETRAX_KGDB
/* Global watch for writes to kernel text segment. */
- crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+ crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
arbiter_all_clients, arbiter_all_write, NULL);
#endif
}
diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c
index a01636a12a6e..d98131c45bb5 100644
--- a/arch/cris/kernel/traps.c
+++ b/arch/cris/kernel/traps.c
@@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *);
void show_trace(unsigned long *stack)
{
unsigned long addr, module_start, module_end;
- extern char _stext, _etext;
+ extern char _stext[], _etext[];
int i;
pr_err("\nCall Trace: ");
@@ -69,8 +69,8 @@ void show_trace(unsigned long *stack)
* down the cause of the crash will be able to figure
* out the call path that was taken.
*/
- if (((addr >= (unsigned long)&_stext) &&
- (addr <= (unsigned long)&_etext)) ||
+ if (((addr >= (unsigned long)_stext) &&
+ (addr <= (unsigned long)_etext)) ||
((addr >= module_start) && (addr <= module_end))) {
#ifdef CONFIG_KALLSYMS
print_ip_sym(addr);
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 2e1da71e27a4..ab346f5f8820 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,7 +7,8 @@
#include <asm/errno.h>
#include <linux/uaccess.h>
-extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
+extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr);
static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index d155ca9e5098..37f7b2bf7f73 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o
/*
* do the futex operations
*/
-int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
@@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS; break;
- }
- }
+ if (!ret)
+ *oval = oldval;
return ret;
-} /* end futex_atomic_op_inuser() */
+} /* end arch_futex_atomic_op_inuser() */
diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h
index 15e701130b27..1c5a30ec2df8 100644
--- a/arch/h8300/include/asm/traps.h
+++ b/arch/h8300/include/asm/traps.h
@@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table;
#define TRAP2_VEC 10
#define TRAP3_VEC 11
-extern char _start, _etext;
+extern char _start[], _etext[];
#define check_kernel_text(addr) \
- ((addr >= (unsigned long)(&_start)) && \
- (addr < (unsigned long)(&_etext)) && !(addr & 1))
+ ((addr >= (unsigned long)(_start)) && \
+ (addr < (unsigned long)(_etext)) && !(addr & 1))
#endif /* _H8300_TRAPS_H */
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index a62ba368b27d..fb3dfb2a667e 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new)
);
}
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
/**
* atomic_read - reads a word, atomically
* @v: pointer to atomic value
diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h
index 7e597f8434da..c607b77c8215 100644
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -31,18 +31,9 @@
static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
- return -EFAULT;
pagefault_disable();
@@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h
index a1c55788c5d6..53a8d5885887 100644
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
*/
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
#define arch_spin_is_locked(x) ((x)->lock != 0)
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index a3d0211970e9..c86a947f5368 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
}
-#define acpi_unlazy_tlb(x)
-
#ifdef CONFIG_ACPI_NUMA
extern cpumask_t early_cpu_possible_map;
#define for_each_possible_early_cpu(cpu) \
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index 76acbcd5c060..6d67dc1eaf2b 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -45,18 +45,9 @@ do { \
} while (0)
static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
pagefault_disable();
@@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index ca9e76149a4a..df2c121164b8 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
ACCESS_ONCE(*p) = (tmp + 2) & ~1;
}
-static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock)
-{
- int *p = (int *)&lock->lock, ticket;
-
- ia64_invala();
-
- for (;;) {
- asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory");
- if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
- return;
- cpu_relax();
- }
-
- smp_acquire__after_ctrl_dep();
-}
-
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
long tmp = ACCESS_ONCE(lock->lock);
@@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
arch_spin_lock(lock);
}
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- __ticket_spin_unlock_wait(lock);
-}
-
#define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0)
#define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0)
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index fced197b9626..cbe5ac3699bf 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -168,7 +168,8 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
tlb->max = ARRAY_SIZE(tlb->local);
@@ -185,8 +186,11 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
* collected.
*/
static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
+ if (force)
+ tlb->need_flush = 1;
/*
* Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
* tlb->end_addr.
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 121295637d0d..81416000c5e0 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
return 0;
}
-u32
+int
efi_mem_type (unsigned long phys_addr)
{
efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
if (md)
return md->type;
- return 0;
+ return -EINVAL;
}
u64
diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h
index 455ce7ddbf14..dfcb0e4eb256 100644
--- a/arch/m32r/include/asm/flat.h
+++ b/arch/m32r/include/asm/flat.h
@@ -95,7 +95,7 @@ static inline unsigned long m32r_flat_get_addr_from_rp (u32 *rp,
return ~0; /* bogus value */
}
-static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
+static inline int flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
{
unsigned int reloc = flat_m32r_get_reloc_type (relval);
if (reloc & 0xf0) {
@@ -133,6 +133,7 @@ static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
break;
}
}
+ return 0;
}
// kludge - text_len is a local variable in the only user.
diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h
index 323c7fc953cd..a56825592b90 100644
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -30,11 +30,6 @@
#define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0)
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->slock, VAL > 0);
-}
-
/**
* arch_spin_trylock - Try spin lock and return a result
* @lock: Pointer to the lock variable
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 5b7a45d99cfb..7d8b322e5101 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -26,6 +26,7 @@ config METAG
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNDERSCORE_SYMBOL_PREFIX
select IRQ_DOMAIN
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK
select MODULES_USE_ELF_RELA
select OF
select OF_EARLY_FLATTREE
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 6c1380a8a0d4..eee779f26cc4 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i)
return i;
}
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
#define ATOMIC_OP(op, c_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h
index c0c7a22be1ae..ddf7fe5708a6 100644
--- a/arch/metag/include/asm/spinlock.h
+++ b/arch/metag/include/asm/spinlock.h
@@ -15,11 +15,6 @@
* locked.
*/
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h
index f23c3d266bae..3d2747d4c967 100644
--- a/arch/microblaze/include/asm/flat.h
+++ b/arch/microblaze/include/asm/flat.h
@@ -60,7 +60,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags,
* unaligned.
*/
-static inline void
+static inline int
flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 relval)
{
u32 *p = (__force u32 *)rp;
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index 01848f056f43..a9dad9e5e132 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,18 +29,9 @@
})
static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
pagefault_disable();
@@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8dd20358464f..48d91d5be4e9 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2260,7 +2260,7 @@ config CPU_R4K_CACHE_TLB
config MIPS_MT_SMP
bool "MIPS MT SMP support (1 TC on each available VPE)"
- depends on SYS_SUPPORTS_MULTITHREADING && !CPU_MIPSR6
+ depends on SYS_SUPPORTS_MULTITHREADING && !CPU_MIPSR6 && !CPU_MICROMIPS
select CPU_MIPSR2_IRQ_VI
select CPU_MIPSR2_IRQ_EI
select SYNC_R4K
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 04343625b929..bc2708c9ada4 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -243,8 +243,21 @@ include arch/mips/Kbuild.platforms
ifdef CONFIG_PHYSICAL_START
load-y = $(CONFIG_PHYSICAL_START)
endif
-entry-y = 0x$(shell $(NM) vmlinux 2>/dev/null \
+
+entry-noisa-y = 0x$(shell $(NM) vmlinux 2>/dev/null \
| grep "\bkernel_entry\b" | cut -f1 -d \ )
+ifdef CONFIG_CPU_MICROMIPS
+ #
+ # Set the ISA bit, since the kernel_entry symbol in the ELF will have it
+ # clear, which would lead to images containing addresses that bootloaders may
+ # jump to as MIPS32 code.
+ #
+ entry-y = $(patsubst %0,%1,$(patsubst %2,%3,$(patsubst %4,%5, \
+ $(patsubst %6,%7,$(patsubst %8,%9,$(patsubst %a,%b, \
+ $(patsubst %c,%d,$(patsubst %e,%f,$(entry-noisa-y)))))))))
+else
+ entry-y = $(entry-noisa-y)
+endif
cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic
drivers-$(CONFIG_PCI) += arch/mips/pci/
diff --git a/arch/mips/boot/compressed/.gitignore b/arch/mips/boot/compressed/.gitignore
new file mode 100644
index 000000000000..ebae133f1d00
--- /dev/null
+++ b/arch/mips/boot/compressed/.gitignore
@@ -0,0 +1,2 @@
+ashldi3.c
+bswapsi.c
diff --git a/arch/mips/cavium-octeon/octeon-usb.c b/arch/mips/cavium-octeon/octeon-usb.c
index 542be1cd0f32..bfdfaf32d2c4 100644
--- a/arch/mips/cavium-octeon/octeon-usb.c
+++ b/arch/mips/cavium-octeon/octeon-usb.c
@@ -13,9 +13,9 @@
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/of_platform.h>
+#include <linux/io.h>
#include <asm/octeon/octeon.h>
-#include <asm/octeon/cvmx-gpio-defs.h>
/* USB Control Register */
union cvm_usbdrd_uctl_ctl {
diff --git a/arch/mips/dec/int-handler.S b/arch/mips/dec/int-handler.S
index 1910223a9c02..cea2bb1621e6 100644
--- a/arch/mips/dec/int-handler.S
+++ b/arch/mips/dec/int-handler.S
@@ -147,23 +147,12 @@
* Find irq with highest priority
*/
# open coded PTR_LA t1, cpu_mask_nr_tbl
-#if (_MIPS_SZPTR == 32)
+#if defined(CONFIG_32BIT) || defined(KBUILD_64BIT_SYM32)
# open coded la t1, cpu_mask_nr_tbl
lui t1, %hi(cpu_mask_nr_tbl)
addiu t1, %lo(cpu_mask_nr_tbl)
-
-#endif
-#if (_MIPS_SZPTR == 64)
- # open coded dla t1, cpu_mask_nr_tbl
- .set push
- .set noat
- lui t1, %highest(cpu_mask_nr_tbl)
- lui AT, %hi(cpu_mask_nr_tbl)
- daddiu t1, t1, %higher(cpu_mask_nr_tbl)
- daddiu AT, AT, %lo(cpu_mask_nr_tbl)
- dsll t1, 32
- daddu t1, t1, AT
- .set pop
+#else
+#error GCC `-msym32' option required for 64-bit DECstation builds
#endif
1: lw t2,(t1)
nop
@@ -214,23 +203,12 @@
* Find irq with highest priority
*/
# open coded PTR_LA t1,asic_mask_nr_tbl
-#if (_MIPS_SZPTR == 32)
+#if defined(CONFIG_32BIT) || defined(KBUILD_64BIT_SYM32)
# open coded la t1, asic_mask_nr_tbl
lui t1, %hi(asic_mask_nr_tbl)
addiu t1, %lo(asic_mask_nr_tbl)
-
-#endif
-#if (_MIPS_SZPTR == 64)
- # open coded dla t1, asic_mask_nr_tbl
- .set push
- .set noat
- lui t1, %highest(asic_mask_nr_tbl)
- lui AT, %hi(asic_mask_nr_tbl)
- daddiu t1, t1, %higher(asic_mask_nr_tbl)
- daddiu AT, AT, %lo(asic_mask_nr_tbl)
- dsll t1, 32
- daddu t1, t1, AT
- .set pop
+#else
+#error GCC `-msym32' option required for 64-bit DECstation builds
#endif
2: lw t2,(t1)
nop
diff --git a/arch/mips/include/asm/cache.h b/arch/mips/include/asm/cache.h
index fc67947ed658..8b14c2706aa5 100644
--- a/arch/mips/include/asm/cache.h
+++ b/arch/mips/include/asm/cache.h
@@ -9,6 +9,8 @@
#ifndef _ASM_CACHE_H
#define _ASM_CACHE_H
+#include <kmalloc.h>
+
#define L1_CACHE_SHIFT CONFIG_MIPS_L1_CACHE_SHIFT
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index 8baa9033b181..721b698bfe3c 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -428,6 +428,9 @@
#ifndef cpu_scache_line_size
#define cpu_scache_line_size() cpu_data[0].scache.linesz
#endif
+#ifndef cpu_tcache_line_size
+#define cpu_tcache_line_size() cpu_data[0].tcache.linesz
+#endif
#ifndef cpu_hwrena_impl_bits
#define cpu_hwrena_impl_bits 0
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 1de190bdfb9c..a9e61ea54ca9 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -83,18 +83,9 @@
}
static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
pagefault_disable();
@@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2998479fd4e8..a9af1d2dcd69 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -938,11 +938,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
-}
-
/* Emulation */
int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/include/asm/mach-ralink/ralink_regs.h b/arch/mips/include/asm/mach-ralink/ralink_regs.h
index 9df1a53bcb36..b4e7dfa214eb 100644
--- a/arch/mips/include/asm/mach-ralink/ralink_regs.h
+++ b/arch/mips/include/asm/mach-ralink/ralink_regs.h
@@ -13,6 +13,8 @@
#ifndef _RALINK_REGS_H_
#define _RALINK_REGS_H_
+#include <linux/io.h>
+
enum ralink_soc_type {
RALINK_UNKNOWN = 0,
RT2880_SOC,
diff --git a/arch/mips/include/asm/octeon/cvmx-l2c-defs.h b/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
index d045973ddb33..3ea84acf1814 100644
--- a/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
+++ b/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
@@ -33,6 +33,10 @@
#define CVMX_L2C_DBG (CVMX_ADD_IO_SEG(0x0001180080000030ull))
#define CVMX_L2C_CFG (CVMX_ADD_IO_SEG(0x0001180080000000ull))
#define CVMX_L2C_CTL (CVMX_ADD_IO_SEG(0x0001180080800000ull))
+#define CVMX_L2C_ERR_TDTX(block_id) \
+ (CVMX_ADD_IO_SEG(0x0001180080A007E0ull) + ((block_id) & 3) * 0x40000ull)
+#define CVMX_L2C_ERR_TTGX(block_id) \
+ (CVMX_ADD_IO_SEG(0x0001180080A007E8ull) + ((block_id) & 3) * 0x40000ull)
#define CVMX_L2C_LCKBASE (CVMX_ADD_IO_SEG(0x0001180080000058ull))
#define CVMX_L2C_LCKOFF (CVMX_ADD_IO_SEG(0x0001180080000060ull))
#define CVMX_L2C_PFCTL (CVMX_ADD_IO_SEG(0x0001180080000090ull))
@@ -66,9 +70,40 @@
((offset) & 1) * 8)
#define CVMX_L2C_WPAR_PPX(offset) (CVMX_ADD_IO_SEG(0x0001180080840000ull) + \
((offset) & 31) * 8)
-#define CVMX_L2D_FUS3 (CVMX_ADD_IO_SEG(0x00011800800007B8ull))
+union cvmx_l2c_err_tdtx {
+ uint64_t u64;
+ struct cvmx_l2c_err_tdtx_s {
+ __BITFIELD_FIELD(uint64_t dbe:1,
+ __BITFIELD_FIELD(uint64_t sbe:1,
+ __BITFIELD_FIELD(uint64_t vdbe:1,
+ __BITFIELD_FIELD(uint64_t vsbe:1,
+ __BITFIELD_FIELD(uint64_t syn:10,
+ __BITFIELD_FIELD(uint64_t reserved_22_49:28,
+ __BITFIELD_FIELD(uint64_t wayidx:18,
+ __BITFIELD_FIELD(uint64_t reserved_2_3:2,
+ __BITFIELD_FIELD(uint64_t type:2,
+ ;)))))))))
+ } s;
+};
+
+union cvmx_l2c_err_ttgx {
+ uint64_t u64;
+ struct cvmx_l2c_err_ttgx_s {
+ __BITFIELD_FIELD(uint64_t dbe:1,
+ __BITFIELD_FIELD(uint64_t sbe:1,
+ __BITFIELD_FIELD(uint64_t noway:1,
+ __BITFIELD_FIELD(uint64_t reserved_56_60:5,
+ __BITFIELD_FIELD(uint64_t syn:6,
+ __BITFIELD_FIELD(uint64_t reserved_22_49:28,
+ __BITFIELD_FIELD(uint64_t wayidx:15,
+ __BITFIELD_FIELD(uint64_t reserved_2_6:5,
+ __BITFIELD_FIELD(uint64_t type:2,
+ ;)))))))))
+ } s;
+};
+
union cvmx_l2c_cfg {
uint64_t u64;
struct cvmx_l2c_cfg_s {
diff --git a/arch/mips/include/asm/octeon/cvmx-l2d-defs.h b/arch/mips/include/asm/octeon/cvmx-l2d-defs.h
new file mode 100644
index 000000000000..a951ad5d65ad
--- /dev/null
+++ b/arch/mips/include/asm/octeon/cvmx-l2d-defs.h
@@ -0,0 +1,60 @@
+/***********************license start***************
+ * Author: Cavium Networks
+ *
+ * Contact: support@caviumnetworks.com
+ * This file is part of the OCTEON SDK
+ *
+ * Copyright (c) 2003-2017 Cavium, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, Version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
+ * NONINFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this file; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * or visit http://www.gnu.org/licenses/.
+ *
+ * This file may also be available under a different license from Cavium.
+ * Contact Cavium Networks for more information
+ ***********************license end**************************************/
+
+#ifndef __CVMX_L2D_DEFS_H__
+#define __CVMX_L2D_DEFS_H__
+
+#define CVMX_L2D_ERR (CVMX_ADD_IO_SEG(0x0001180080000010ull))
+#define CVMX_L2D_FUS3 (CVMX_ADD_IO_SEG(0x00011800800007B8ull))
+
+
+union cvmx_l2d_err {
+ uint64_t u64;
+ struct cvmx_l2d_err_s {
+ __BITFIELD_FIELD(uint64_t reserved_6_63:58,
+ __BITFIELD_FIELD(uint64_t bmhclsel:1,
+ __BITFIELD_FIELD(uint64_t ded_err:1,
+ __BITFIELD_FIELD(uint64_t sec_err:1,
+ __BITFIELD_FIELD(uint64_t ded_intena:1,
+ __BITFIELD_FIELD(uint64_t sec_intena:1,
+ __BITFIELD_FIELD(uint64_t ecc_ena:1,
+ ;)))))))
+ } s;
+};
+
+union cvmx_l2d_fus3 {
+ uint64_t u64;
+ struct cvmx_l2d_fus3_s {
+ __BITFIELD_FIELD(uint64_t reserved_40_63:24,
+ __BITFIELD_FIELD(uint64_t ema_ctl:3,
+ __BITFIELD_FIELD(uint64_t reserved_34_36:3,
+ __BITFIELD_FIELD(uint64_t q3fus:34,
+ ;))))
+ } s;
+};
+
+#endif
diff --git a/arch/mips/include/asm/octeon/cvmx.h b/arch/mips/include/asm/octeon/cvmx.h
index 9742202f2a32..e638735cc3ac 100644
--- a/arch/mips/include/asm/octeon/cvmx.h
+++ b/arch/mips/include/asm/octeon/cvmx.h
@@ -62,6 +62,7 @@ enum cvmx_mips_space {
#include <asm/octeon/cvmx-iob-defs.h>
#include <asm/octeon/cvmx-ipd-defs.h>
#include <asm/octeon/cvmx-l2c-defs.h>
+#include <asm/octeon/cvmx-l2d-defs.h>
#include <asm/octeon/cvmx-l2t-defs.h>
#include <asm/octeon/cvmx-led-defs.h>
#include <asm/octeon/cvmx-mio-defs.h>
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 6dd13641a418..1395654cfc8d 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -872,15 +872,13 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
if (unlikely(test_thread_flag(TIF_SECCOMP))) {
int ret, i;
struct seccomp_data sd;
+ unsigned long args[6];
sd.nr = syscall;
sd.arch = syscall_get_arch();
- for (i = 0; i < 6; i++) {
- unsigned long v, r;
-
- r = mips_get_syscall_arg(&v, current, regs, i);
- sd.args[i] = r ? 0 : v;
- }
+ syscall_get_arguments(current, regs, 0, 6, args);
+ for (i = 0; i < 6; i++)
+ sd.args[i] = args[i];
sd.instruction_pointer = KSTK_EIP(current);
ret = __secure_computing(&sd);
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 27c2f90eeb21..a9a7d78803cd 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -190,12 +190,6 @@ illegal_syscall:
sll t1, t0, 2
beqz v0, einval
lw t2, sys_call_table(t1) # syscall routine
- sw a0, PT_R2(sp) # call routine directly on restart
-
- /* Some syscalls like execve get their arguments from struct pt_regs
- and claim zero arguments in the syscall table. Thus we have to
- assume the worst case and shuffle around all potential arguments.
- If you want performance, don't use indirect syscalls. */
move a0, a1 # shift argument registers
move a1, a2
@@ -207,11 +201,6 @@ illegal_syscall:
sw t4, 16(sp)
sw t5, 20(sp)
sw t6, 24(sp)
- sw a0, PT_R4(sp) # .. and push back a0 - a3, some
- sw a1, PT_R5(sp) # syscalls expect them there
- sw a2, PT_R6(sp)
- sw a3, PT_R7(sp)
- sw a3, PT_R26(sp) # update a3 for syscall restarting
jr t2
/* Unreached */
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index c30bc520885f..9ebe3e2403b1 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -198,7 +198,6 @@ LEAF(sys32_syscall)
dsll t1, t0, 3
beqz v0, einval
ld t2, sys32_call_table(t1) # syscall routine
- sd a0, PT_R2(sp) # call routine directly on restart
move a0, a1 # shift argument registers
move a1, a2
@@ -207,11 +206,6 @@ LEAF(sys32_syscall)
move a4, a5
move a5, a6
move a6, a7
- sd a0, PT_R4(sp) # ... and push back a0 - a3, some
- sd a1, PT_R5(sp) # syscalls expect them there
- sd a2, PT_R6(sp)
- sd a3, PT_R7(sp)
- sd a3, PT_R26(sp) # update a3 for syscall restarting
jr t2
/* Unreached */
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 770d4d1516cb..c7cbddfcdc3b 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -376,9 +376,6 @@ asmlinkage void start_secondary(void)
cpumask_set_cpu(cpu, &cpu_coherent_mask);
notify_cpu_starting(cpu);
- complete(&cpu_running);
- synchronise_count_slave(cpu);
-
set_cpu_online(cpu, true);
set_cpu_sibling_map(cpu);
@@ -386,6 +383,9 @@ asmlinkage void start_secondary(void)
calculate_cpu_foreign_map();
+ complete(&cpu_running);
+ synchronise_count_slave(cpu);
+
/*
* irq will be enabled in ->smp_finish(), enabling it too early
* is dangerous.
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
void tick_broadcast(const struct cpumask *mask)
{
atomic_t *count;
- struct call_single_data *csd;
+ call_single_data_t *csd;
int cpu;
for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
static int __init tick_broadcast_init(void)
{
- struct call_single_data *csd;
+ call_single_data_t *csd;
int cpu;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 3f74f6c1f065..9fea6c6bbf49 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -48,7 +48,7 @@
#include "uasm.c"
-static const struct insn const insn_table[insn_invalid] = {
+static const struct insn insn_table[insn_invalid] = {
[insn_addiu] = {M(addiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM},
[insn_addu] = {M(spec_op, 0, 0, 0, 0, addu_op), RS | RT | RD},
[insn_and] = {M(spec_op, 0, 0, 0, 0, and_op), RS | RT | RD},
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
new file mode 100644
index 000000000000..3f87b96da5c4
--- /dev/null
+++ b/arch/mips/net/ebpf_jit.c
@@ -0,0 +1,1950 @@
+/*
+ * Just-In-Time compiler for eBPF filters on MIPS
+ *
+ * Copyright (c) 2017 Cavium, Inc.
+ *
+ * Based on code from:
+ *
+ * Copyright (c) 2014 Imagination Technologies Ltd.
+ * Author: Markos Chandras <markos.chandras@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/slab.h>
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+#include <asm/cacheflush.h>
+#include <asm/cpu-features.h>
+#include <asm/uasm.h>
+
+/* Registers used by JIT */
+#define MIPS_R_ZERO 0
+#define MIPS_R_AT 1
+#define MIPS_R_V0 2 /* BPF_R0 */
+#define MIPS_R_V1 3
+#define MIPS_R_A0 4 /* BPF_R1 */
+#define MIPS_R_A1 5 /* BPF_R2 */
+#define MIPS_R_A2 6 /* BPF_R3 */
+#define MIPS_R_A3 7 /* BPF_R4 */
+#define MIPS_R_A4 8 /* BPF_R5 */
+#define MIPS_R_T4 12 /* BPF_AX */
+#define MIPS_R_T5 13
+#define MIPS_R_T6 14
+#define MIPS_R_T7 15
+#define MIPS_R_S0 16 /* BPF_R6 */
+#define MIPS_R_S1 17 /* BPF_R7 */
+#define MIPS_R_S2 18 /* BPF_R8 */
+#define MIPS_R_S3 19 /* BPF_R9 */
+#define MIPS_R_S4 20 /* BPF_TCC */
+#define MIPS_R_S5 21
+#define MIPS_R_S6 22
+#define MIPS_R_S7 23
+#define MIPS_R_T8 24
+#define MIPS_R_T9 25
+#define MIPS_R_SP 29
+#define MIPS_R_RA 31
+
+/* eBPF flags */
+#define EBPF_SAVE_S0 BIT(0)
+#define EBPF_SAVE_S1 BIT(1)
+#define EBPF_SAVE_S2 BIT(2)
+#define EBPF_SAVE_S3 BIT(3)
+#define EBPF_SAVE_S4 BIT(4)
+#define EBPF_SAVE_RA BIT(5)
+#define EBPF_SEEN_FP BIT(6)
+#define EBPF_SEEN_TC BIT(7)
+#define EBPF_TCC_IN_V1 BIT(8)
+
+/*
+ * For the mips64 ISA, we need to track the value range or type for
+ * each JIT register. The BPF machine requires zero extended 32-bit
+ * values, but the mips64 ISA requires sign extended 32-bit values.
+ * At each point in the BPF program we track the state of every
+ * register so that we can zero extend or sign extend as the BPF
+ * semantics require.
+ */
+enum reg_val_type {
+ /* uninitialized */
+ REG_UNKNOWN,
+ /* not known to be 32-bit compatible. */
+ REG_64BIT,
+ /* 32-bit compatible, no truncation needed for 64-bit ops. */
+ REG_64BIT_32BIT,
+ /* 32-bit compatible, need truncation for 64-bit ops. */
+ REG_32BIT,
+ /* 32-bit zero extended. */
+ REG_32BIT_ZERO_EX,
+ /* 32-bit no sign/zero extension needed. */
+ REG_32BIT_POS
+};
+
+/*
+ * high bit of offsets indicates if long branch conversion done at
+ * this insn.
+ */
+#define OFFSETS_B_CONV BIT(31)
+
+/**
+ * struct jit_ctx - JIT context
+ * @skf: The sk_filter
+ * @stack_size: eBPF stack size
+ * @tmp_offset: eBPF $sp offset to 8-byte temporary memory
+ * @idx: Instruction index
+ * @flags: JIT flags
+ * @offsets: Instruction offsets
+ * @target: Memory location for the compiled filter
+ * @reg_val_types: Packed enum reg_val_type for each register.
+ */
+struct jit_ctx {
+ const struct bpf_prog *skf;
+ int stack_size;
+ int tmp_offset;
+ u32 idx;
+ u32 flags;
+ u32 *offsets;
+ u32 *target;
+ u64 *reg_val_types;
+ unsigned int long_b_conversion:1;
+ unsigned int gen_b_offsets:1;
+};
+
+static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type)
+{
+ *rvt &= ~(7ull << (reg * 3));
+ *rvt |= ((u64)type << (reg * 3));
+}
+
+static enum reg_val_type get_reg_val_type(const struct jit_ctx *ctx,
+ int index, int reg)
+{
+ return (ctx->reg_val_types[index] >> (reg * 3)) & 7;
+}
+
+/* Simply emit the instruction if the JIT memory space has been allocated */
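+/* The index always advances, so a first pass with target == NULL can size the image. */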
+#define emit_instr(ctx, func, ...) \
+do { \
+ if ((ctx)->target != NULL) { \
+ u32 *p = &(ctx)->target[ctx->idx]; \
+ uasm_i_##func(&p, ##__VA_ARGS__); \
+ } \
+ (ctx)->idx++; \
+} while (0)
+
+static unsigned int j_target(struct jit_ctx *ctx, int target_idx)
+{
+ unsigned long target_va, base_va;
+ unsigned int r;
+
+ if (!ctx->target)
+ return 0;
+
+ base_va = (unsigned long)ctx->target;
+ target_va = base_va + (ctx->offsets[target_idx] & ~OFFSETS_B_CONV);
+
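+ /*
+ * The "j" instruction keeps only the low 28 bits of the target, so
+ * the jump and its target must share the same 256MB region.
+ */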
+ if ((base_va & ~0x0ffffffful) != (target_va & ~0x0ffffffful))
+ return (unsigned int)-1;
+ r = target_va & 0x0ffffffful;
+ return r;
+}
+
+/* Compute the immediate value for PC-relative branches. */
+static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx)
+{
+ if (!ctx->gen_b_offsets)
+ return 0;
+
+ /*
+ * We want a pc-relative branch. tgt is the instruction offset
+ * we want to jump to.
+ *
+ * Branch on MIPS:
+ * I: target_offset <- sign_extend(offset)
+ * I+1: PC += target_offset (delay slot)
+ *
+ * ctx->idx currently points to the branch instruction
+ * but the offset is added to the delay slot so we need
+ * to subtract 4.
+ */
+ return (ctx->offsets[tgt] & ~OFFSETS_B_CONV) -
+ (ctx->idx * 4) - 4;
+}
+
+int bpf_jit_enable __read_mostly;
+
+enum which_ebpf_reg {
+ src_reg,
+ src_reg_no_fp,
+ dst_reg,
+ dst_reg_fp_ok
+};
+
+/*
+ * For eBPF, the register mapping naturally falls out of the
+ * requirements of eBPF and the MIPS n64 ABI. We don't maintain a
+ * separate frame pointer, so BPF_REG_10 relative accesses are
+ * adjusted to be $sp relative.
+ */
+int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn,
+ enum which_ebpf_reg w)
+{
+ int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ?
+ insn->src_reg : insn->dst_reg;
+
+ switch (ebpf_reg) {
+ case BPF_REG_0:
+ return MIPS_R_V0;
+ case BPF_REG_1:
+ return MIPS_R_A0;
+ case BPF_REG_2:
+ return MIPS_R_A1;
+ case BPF_REG_3:
+ return MIPS_R_A2;
+ case BPF_REG_4:
+ return MIPS_R_A3;
+ case BPF_REG_5:
+ return MIPS_R_A4;
+ case BPF_REG_6:
+ ctx->flags |= EBPF_SAVE_S0;
+ return MIPS_R_S0;
+ case BPF_REG_7:
+ ctx->flags |= EBPF_SAVE_S1;
+ return MIPS_R_S1;
+ case BPF_REG_8:
+ ctx->flags |= EBPF_SAVE_S2;
+ return MIPS_R_S2;
+ case BPF_REG_9:
+ ctx->flags |= EBPF_SAVE_S3;
+ return MIPS_R_S3;
+ case BPF_REG_10:
+ if (w == dst_reg || w == src_reg_no_fp)
+ goto bad_reg;
+ ctx->flags |= EBPF_SEEN_FP;
+ /*
+ * Needs special handling, return something that
+ * cannot be clobbered just in case.
+ */
+ return MIPS_R_ZERO;
+ case BPF_REG_AX:
+ return MIPS_R_T4;
+ default:
+bad_reg:
+ WARN(1, "Illegal bpf reg: %d\n", ebpf_reg);
+ return -EINVAL;
+ }
+}
+/*
+ * eBPF stack frame will be something like:
+ *
+ * Entry $sp ------> +--------------------------------+
+ * | $ra (optional) |
+ * +--------------------------------+
+ * | $s0 (optional) |
+ * +--------------------------------+
+ * | $s1 (optional) |
+ * +--------------------------------+
+ * | $s2 (optional) |
+ * +--------------------------------+
+ * | $s3 (optional) |
+ * +--------------------------------+
+ * | $s4 (optional) |
+ * +--------------------------------+
+ * | tmp-storage (if $ra saved) |
+ * $sp + tmp_offset --> +--------------------------------+ <--BPF_REG_10
+ * | BPF_REG_10 relative storage |
+ * | MAX_BPF_STACK (optional) |
+ * | . |
+ * | . |
+ * | . |
+ * $sp --------> +--------------------------------+
+ *
+ * If BPF_REG_10 is never referenced, then the MAX_BPF_STACK sized
+ * area is not allocated.
+ */
+static int gen_int_prologue(struct jit_ctx *ctx)
+{
+ int stack_adjust = 0;
+ int store_offset;
+ int locals_size;
+
+ if (ctx->flags & EBPF_SAVE_RA)
+ /*
+ * If $ra needs saving, we are making a function call and may
+ * need an extra 8-byte tmp area.
+ */
+ stack_adjust += 16;
+ if (ctx->flags & EBPF_SAVE_S0)
+ stack_adjust += 8;
+ if (ctx->flags & EBPF_SAVE_S1)
+ stack_adjust += 8;
+ if (ctx->flags & EBPF_SAVE_S2)
+ stack_adjust += 8;
+ if (ctx->flags & EBPF_SAVE_S3)
+ stack_adjust += 8;
+ if (ctx->flags & EBPF_SAVE_S4)
+ stack_adjust += 8;
+
+ BUILD_BUG_ON(MAX_BPF_STACK & 7);
+ locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0;
+
+ stack_adjust += locals_size;
+ ctx->tmp_offset = locals_size;
+
+ ctx->stack_size = stack_adjust;
+
+ /*
+ * First instruction initializes the tail call count (TCC).
+ * On tail call we skip this instruction, and the TCC is
+ * passed in $v1 from the caller.
+ */
+ emit_instr(ctx, daddiu, MIPS_R_V1, MIPS_R_ZERO, MAX_TAIL_CALL_CNT);
+ if (stack_adjust)
+ emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack_adjust);
+ else
+ return 0;
+
+ store_offset = stack_adjust - 8;
+
+ if (ctx->flags & EBPF_SAVE_RA) {
+ emit_instr(ctx, sd, MIPS_R_RA, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S0) {
+ emit_instr(ctx, sd, MIPS_R_S0, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S1) {
+ emit_instr(ctx, sd, MIPS_R_S1, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S2) {
+ emit_instr(ctx, sd, MIPS_R_S2, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S3) {
+ emit_instr(ctx, sd, MIPS_R_S3, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S4) {
+ emit_instr(ctx, sd, MIPS_R_S4, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+
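+ /* If tail calls are used, keep the TCC in callee-saved $s4 unless it stays in $v1. */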
+ if ((ctx->flags & EBPF_SEEN_TC) && !(ctx->flags & EBPF_TCC_IN_V1))
+ emit_instr(ctx, daddu, MIPS_R_S4, MIPS_R_V1, MIPS_R_ZERO);
+
+ return 0;
+}
+
+static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg)
+{
+ const struct bpf_prog *prog = ctx->skf;
+ int stack_adjust = ctx->stack_size;
+ int store_offset = stack_adjust - 8;
+ int r0 = MIPS_R_V0;
+
+ if (dest_reg == MIPS_R_RA &&
+ get_reg_val_type(ctx, prog->len, BPF_REG_0) == REG_32BIT_ZERO_EX)
+ /* Don't let zero extended value escape. */
+ emit_instr(ctx, sll, r0, r0, 0);
+
+ if (ctx->flags & EBPF_SAVE_RA) {
+ emit_instr(ctx, ld, MIPS_R_RA, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S0) {
+ emit_instr(ctx, ld, MIPS_R_S0, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S1) {
+ emit_instr(ctx, ld, MIPS_R_S1, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S2) {
+ emit_instr(ctx, ld, MIPS_R_S2, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S3) {
+ emit_instr(ctx, ld, MIPS_R_S3, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ if (ctx->flags & EBPF_SAVE_S4) {
+ emit_instr(ctx, ld, MIPS_R_S4, store_offset, MIPS_R_SP);
+ store_offset -= 8;
+ }
+ emit_instr(ctx, jr, dest_reg);
+
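+ /* The $sp restore (or a nop) sits in the jr delay slot. */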
+ if (stack_adjust)
+ emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, stack_adjust);
+ else
+ emit_instr(ctx, nop);
+
+ return 0;
+}
+
+static void gen_imm_to_reg(const struct bpf_insn *insn, int reg,
+ struct jit_ctx *ctx)
+{
+ if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
+ emit_instr(ctx, addiu, reg, MIPS_R_ZERO, insn->imm);
+ } else {
+ int lower = (s16)(insn->imm & 0xffff);
+ int upper = insn->imm - lower;
+
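+ /*
+ * addiu sign-extends "lower", so "upper" above was computed as
+ * imm - (s16)lower; lui/addiu then rebuild the exact 32-bit
+ * immediate.
+ */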
+ emit_instr(ctx, lui, reg, upper >> 16);
+ emit_instr(ctx, addiu, reg, reg, lower);
+ }
+
+}
+
+static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+ int idx)
+{
+ int upper_bound, lower_bound;
+ int dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+
+ if (dst < 0)
+ return dst;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_MOV:
+ case BPF_ADD:
+ upper_bound = S16_MAX;
+ lower_bound = S16_MIN;
+ break;
+ case BPF_SUB:
+ upper_bound = -(int)S16_MIN;
+ lower_bound = -(int)S16_MAX;
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ upper_bound = 0xffff;
+ lower_bound = 0;
+ break;
+ case BPF_RSH:
+ case BPF_LSH:
+ case BPF_ARSH:
+ /* Shift amounts are truncated, no need for bounds */
+ upper_bound = S32_MAX;
+ lower_bound = S32_MIN;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * Immediate move clobbers the register, so no sign/zero
+ * extension needed.
+ */
+ if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_OP(insn->code) != BPF_MOV &&
+ get_reg_val_type(ctx, idx, insn->dst_reg) == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+ /* BPF_ALU | BPF_LSH doesn't need separate sign extension */
+ if (BPF_CLASS(insn->code) == BPF_ALU &&
+ BPF_OP(insn->code) != BPF_LSH &&
+ BPF_OP(insn->code) != BPF_MOV &&
+ get_reg_val_type(ctx, idx, insn->dst_reg) != REG_32BIT)
+ emit_instr(ctx, sll, dst, dst, 0);
+
+ if (insn->imm >= lower_bound && insn->imm <= upper_bound) {
+ /* single insn immediate case */
+ switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) {
+ case BPF_ALU64 | BPF_MOV:
+ emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, insn->imm);
+ break;
+ case BPF_ALU64 | BPF_AND:
+ case BPF_ALU | BPF_AND:
+ emit_instr(ctx, andi, dst, dst, insn->imm);
+ break;
+ case BPF_ALU64 | BPF_OR:
+ case BPF_ALU | BPF_OR:
+ emit_instr(ctx, ori, dst, dst, insn->imm);
+ break;
+ case BPF_ALU64 | BPF_XOR:
+ case BPF_ALU | BPF_XOR:
+ emit_instr(ctx, xori, dst, dst, insn->imm);
+ break;
+ case BPF_ALU64 | BPF_ADD:
+ emit_instr(ctx, daddiu, dst, dst, insn->imm);
+ break;
+ case BPF_ALU64 | BPF_SUB:
+ emit_instr(ctx, daddiu, dst, dst, -insn->imm);
+ break;
+ case BPF_ALU64 | BPF_RSH:
+ emit_instr(ctx, dsrl_safe, dst, dst, insn->imm & 0x3f);
+ break;
+ case BPF_ALU | BPF_RSH:
+ emit_instr(ctx, srl, dst, dst, insn->imm & 0x1f);
+ break;
+ case BPF_ALU64 | BPF_LSH:
+ emit_instr(ctx, dsll_safe, dst, dst, insn->imm & 0x3f);
+ break;
+ case BPF_ALU | BPF_LSH:
+ emit_instr(ctx, sll, dst, dst, insn->imm & 0x1f);
+ break;
+ case BPF_ALU64 | BPF_ARSH:
+ emit_instr(ctx, dsra_safe, dst, dst, insn->imm & 0x3f);
+ break;
+ case BPF_ALU | BPF_ARSH:
+ emit_instr(ctx, sra, dst, dst, insn->imm & 0x1f);
+ break;
+ case BPF_ALU | BPF_MOV:
+ emit_instr(ctx, addiu, dst, MIPS_R_ZERO, insn->imm);
+ break;
+ case BPF_ALU | BPF_ADD:
+ emit_instr(ctx, addiu, dst, dst, insn->imm);
+ break;
+ case BPF_ALU | BPF_SUB:
+ emit_instr(ctx, addiu, dst, dst, -insn->imm);
+ break;
+ default:
+ return -EINVAL;
+ }
+ } else {
+ /* multi insn immediate case */
+ if (BPF_OP(insn->code) == BPF_MOV) {
+ gen_imm_to_reg(insn, dst, ctx);
+ } else {
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) {
+ case BPF_ALU64 | BPF_AND:
+ case BPF_ALU | BPF_AND:
+ emit_instr(ctx, and, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU64 | BPF_OR:
+ case BPF_ALU | BPF_OR:
+ emit_instr(ctx, or, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU64 | BPF_XOR:
+ case BPF_ALU | BPF_XOR:
+ emit_instr(ctx, xor, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU64 | BPF_ADD:
+ emit_instr(ctx, daddu, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU64 | BPF_SUB:
+ emit_instr(ctx, dsubu, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU | BPF_ADD:
+ emit_instr(ctx, addu, dst, dst, MIPS_R_AT);
+ break;
+ case BPF_ALU | BPF_SUB:
+ emit_instr(ctx, subu, dst, dst, MIPS_R_AT);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void * __must_check
+ool_skb_header_pointer(const struct sk_buff *skb, int offset,
+ int len, void *buffer)
+{
+ return skb_header_pointer(skb, offset, len, buffer);
+}
+
+static int size_to_len(const struct bpf_insn *insn)
+{
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_B:
+ return 1;
+ case BPF_H:
+ return 2;
+ case BPF_W:
+ return 4;
+ case BPF_DW:
+ return 8;
+ }
+ return 0;
+}
+
+static void emit_const_to_reg(struct jit_ctx *ctx, int dst, u64 value)
+{
+ if (value >= 0xffffffffffff8000ull || value < 0x8000ull) {
+ emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, (int)value);
+ } else if (value >= 0xffffffff80000000ull ||
+ (value < 0x80000000 && value > 0xffff)) {
+ emit_instr(ctx, lui, dst, (s32)(s16)(value >> 16));
+ emit_instr(ctx, ori, dst, dst, (unsigned int)(value & 0xffff));
+ } else {
+ int i;
+ bool seen_part = false;
+ int needed_shift = 0;
+
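+ /*
+ * Build the constant 16 bits at a time, most-significant halfword
+ * first, skipping zero halfwords and deferring shifts until the
+ * next non-zero part (or the last one).
+ */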
+ for (i = 0; i < 4; i++) {
+ u64 part = (value >> (16 * (3 - i))) & 0xffff;
+
+ if (seen_part && needed_shift > 0 && (part || i == 3)) {
+ emit_instr(ctx, dsll_safe, dst, dst, needed_shift);
+ needed_shift = 0;
+ }
+ if (part) {
+ if (i == 0 || (!seen_part && i < 3 && part < 0x8000)) {
+ emit_instr(ctx, lui, dst, (s32)(s16)part);
+ needed_shift = -16;
+ } else {
+ emit_instr(ctx, ori, dst,
+ seen_part ? dst : MIPS_R_ZERO,
+ (unsigned int)part);
+ }
+ seen_part = true;
+ }
+ if (seen_part)
+ needed_shift += 16;
+ }
+ }
+}
+
+static int emit_bpf_tail_call(struct jit_ctx *ctx, int this_idx)
+{
+ int off, b_off;
+
+ ctx->flags |= EBPF_SEEN_TC;
+ /*
+ * if (index >= array->map.max_entries)
+ * goto out;
+ */
+ off = offsetof(struct bpf_array, map.max_entries);
+ emit_instr(ctx, lwu, MIPS_R_T5, off, MIPS_R_A1);
+ emit_instr(ctx, sltu, MIPS_R_AT, MIPS_R_T5, MIPS_R_A2);
+ b_off = b_imm(this_idx + 1, ctx);
+ emit_instr(ctx, bne, MIPS_R_AT, MIPS_R_ZERO, b_off);
+ /*
+ * if (--TCC < 0)
+ * goto out;
+ */
+ /* Delay slot */
+ emit_instr(ctx, daddiu, MIPS_R_T5,
+ (ctx->flags & EBPF_TCC_IN_V1) ? MIPS_R_V1 : MIPS_R_S4, -1);
+ b_off = b_imm(this_idx + 1, ctx);
+ emit_instr(ctx, bltz, MIPS_R_T5, b_off);
+ /*
+ * prog = array->ptrs[index];
+ * if (prog == NULL)
+ * goto out;
+ */
+ /* Delay slot */
+ emit_instr(ctx, dsll, MIPS_R_T8, MIPS_R_A2, 3);
+ emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, MIPS_R_A1);
+ off = offsetof(struct bpf_array, ptrs);
+ emit_instr(ctx, ld, MIPS_R_AT, off, MIPS_R_T8);
+ b_off = b_imm(this_idx + 1, ctx);
+ emit_instr(ctx, beq, MIPS_R_AT, MIPS_R_ZERO, b_off);
+ /* Delay slot */
+ emit_instr(ctx, nop);
+
+ /* goto *(prog->bpf_func + 4); */
+ off = offsetof(struct bpf_prog, bpf_func);
+ emit_instr(ctx, ld, MIPS_R_T9, off, MIPS_R_AT);
+ /* All systems are go... propagate TCC */
+ emit_instr(ctx, daddu, MIPS_R_V1, MIPS_R_T5, MIPS_R_ZERO);
+ /* Skip first instruction (TCC initialization) */
+ emit_instr(ctx, daddiu, MIPS_R_T9, MIPS_R_T9, 4);
+ return build_int_epilogue(ctx, MIPS_R_T9);
+}
+
+static bool use_bbit_insns(void)
+{
+ switch (current_cpu_type()) {
+ case CPU_CAVIUM_OCTEON:
+ case CPU_CAVIUM_OCTEON_PLUS:
+ case CPU_CAVIUM_OCTEON2:
+ case CPU_CAVIUM_OCTEON3:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_bad_offset(int b_off)
+{
+ return b_off > 0x1ffff || b_off < -0x20000;
+}
+
+/* Returns the number of insn slots consumed. */
+static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+ int this_idx, int exit_idx)
+{
+ int src, dst, r, td, ts, mem_off, b_off;
+ bool need_swap, did_move, cmp_eq;
+ unsigned int target;
+ u64 t64;
+ s64 t64s;
+
+ switch (insn->code) {
+ case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_SUB | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_OR | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_AND | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_LSH | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_RSH | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_XOR | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_ARSH | BPF_K: /* ALU64_IMM */
+ case BPF_ALU64 | BPF_MOV | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_MOV | BPF_K: /* ALU32_IMM */
+ case BPF_ALU | BPF_ADD | BPF_K: /* ALU32_IMM */
+ case BPF_ALU | BPF_SUB | BPF_K: /* ALU32_IMM */
+ case BPF_ALU | BPF_OR | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_AND | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_LSH | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_RSH | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_XOR | BPF_K: /* ALU64_IMM */
+ case BPF_ALU | BPF_ARSH | BPF_K: /* ALU64_IMM */
+ r = gen_imm_insn(insn, ctx, this_idx);
+ if (r < 0)
+ return r;
+ break;
+ case BPF_ALU64 | BPF_MUL | BPF_K: /* ALU64_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+ if (insn->imm == 1) /* Mult by 1 is a nop */
+ break;
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ emit_instr(ctx, dmultu, MIPS_R_AT, dst);
+ emit_instr(ctx, mflo, dst);
+ break;
+ case BPF_ALU64 | BPF_NEG | BPF_K: /* ALU64_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+ emit_instr(ctx, dsubu, dst, MIPS_R_ZERO, dst);
+ break;
+ case BPF_ALU | BPF_MUL | BPF_K: /* ALU_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+ /* sign extend */
+ emit_instr(ctx, sll, dst, dst, 0);
+ }
+ if (insn->imm == 1) /* Mult by 1 is a nop */
+ break;
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ emit_instr(ctx, multu, dst, MIPS_R_AT);
+ emit_instr(ctx, mflo, dst);
+ break;
+ case BPF_ALU | BPF_NEG | BPF_K: /* ALU_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+ /* sign extend */
+ emit_instr(ctx, sll, dst, dst, 0);
+ }
+ emit_instr(ctx, subu, dst, MIPS_R_ZERO, dst);
+ break;
+ case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */
+ case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ if (insn->imm == 0) { /* Div by zero */
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
+ }
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ if (td == REG_64BIT || td == REG_32BIT_ZERO_EX)
+ /* sign extend */
+ emit_instr(ctx, sll, dst, dst, 0);
+ if (insn->imm == 1) {
+ /* div by 1 is a nop, mod by 1 is zero */
+ if (BPF_OP(insn->code) == BPF_MOD)
+ emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO);
+ break;
+ }
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ emit_instr(ctx, divu, dst, MIPS_R_AT);
+ if (BPF_OP(insn->code) == BPF_DIV)
+ emit_instr(ctx, mflo, dst);
+ else
+ emit_instr(ctx, mfhi, dst);
+ break;
+ case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */
+ case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ if (insn->imm == 0) { /* Div by zero */
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
+ }
+ if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+
+ if (insn->imm == 1) {
+ /* div by 1 is a nop, mod by 1 is zero */
+ if (BPF_OP(insn->code) == BPF_MOD)
+ emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO);
+ break;
+ }
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ emit_instr(ctx, ddivu, dst, MIPS_R_AT);
+ if (BPF_OP(insn->code) == BPF_DIV)
+ emit_instr(ctx, mflo, dst);
+ else
+ emit_instr(ctx, mfhi, dst);
+ break;
+ case BPF_ALU64 | BPF_MOV | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_ADD | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_SUB | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_XOR | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_OR | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_AND | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_MUL | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_DIV | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_MOD | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_LSH | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_RSH | BPF_X: /* ALU64_REG */
+ case BPF_ALU64 | BPF_ARSH | BPF_X: /* ALU64_REG */
+ src = ebpf_to_mips_reg(ctx, insn, src_reg);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (src < 0 || dst < 0)
+ return -EINVAL;
+ if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+ did_move = false;
+ if (insn->src_reg == BPF_REG_10) {
+ if (BPF_OP(insn->code) == BPF_MOV) {
+ emit_instr(ctx, daddiu, dst, MIPS_R_SP, MAX_BPF_STACK);
+ did_move = true;
+ } else {
+ emit_instr(ctx, daddiu, MIPS_R_AT, MIPS_R_SP, MAX_BPF_STACK);
+ src = MIPS_R_AT;
+ }
+ } else if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+ int tmp_reg = MIPS_R_AT;
+
+ if (BPF_OP(insn->code) == BPF_MOV) {
+ tmp_reg = dst;
+ did_move = true;
+ }
+ emit_instr(ctx, daddu, tmp_reg, src, MIPS_R_ZERO);
+ emit_instr(ctx, dinsu, tmp_reg, MIPS_R_ZERO, 32, 32);
+ src = MIPS_R_AT;
+ }
+ switch (BPF_OP(insn->code)) {
+ case BPF_MOV:
+ if (!did_move)
+ emit_instr(ctx, daddu, dst, src, MIPS_R_ZERO);
+ break;
+ case BPF_ADD:
+ emit_instr(ctx, daddu, dst, dst, src);
+ break;
+ case BPF_SUB:
+ emit_instr(ctx, dsubu, dst, dst, src);
+ break;
+ case BPF_XOR:
+ emit_instr(ctx, xor, dst, dst, src);
+ break;
+ case BPF_OR:
+ emit_instr(ctx, or, dst, dst, src);
+ break;
+ case BPF_AND:
+ emit_instr(ctx, and, dst, dst, src);
+ break;
+ case BPF_MUL:
+ emit_instr(ctx, dmultu, dst, src);
+ emit_instr(ctx, mflo, dst);
+ break;
+ case BPF_DIV:
+ case BPF_MOD:
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
+ emit_instr(ctx, ddivu, dst, src);
+ if (BPF_OP(insn->code) == BPF_DIV)
+ emit_instr(ctx, mflo, dst);
+ else
+ emit_instr(ctx, mfhi, dst);
+ break;
+ case BPF_LSH:
+ emit_instr(ctx, dsllv, dst, dst, src);
+ break;
+ case BPF_RSH:
+ emit_instr(ctx, dsrlv, dst, dst, src);
+ break;
+ case BPF_ARSH:
+ emit_instr(ctx, dsrav, dst, dst, src);
+ break;
+ default:
+ pr_err("ALU64_REG NOT HANDLED\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_ALU | BPF_MOV | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_ADD | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_SUB | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_XOR | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_OR | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_AND | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_MUL | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_DIV | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_MOD | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_LSH | BPF_X: /* ALU_REG */
+ case BPF_ALU | BPF_RSH | BPF_X: /* ALU_REG */
+ src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (src < 0 || dst < 0)
+ return -EINVAL;
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+ /* sign extend */
+ emit_instr(ctx, sll, dst, dst, 0);
+ }
+ did_move = false;
+ ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+ if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) {
+ int tmp_reg = MIPS_R_AT;
+
+ if (BPF_OP(insn->code) == BPF_MOV) {
+ tmp_reg = dst;
+ did_move = true;
+ }
+ /* sign extend */
+ emit_instr(ctx, sll, tmp_reg, src, 0);
+ src = MIPS_R_AT;
+ }
+ switch (BPF_OP(insn->code)) {
+ case BPF_MOV:
+ if (!did_move)
+ emit_instr(ctx, addu, dst, src, MIPS_R_ZERO);
+ break;
+ case BPF_ADD:
+ emit_instr(ctx, addu, dst, dst, src);
+ break;
+ case BPF_SUB:
+ emit_instr(ctx, subu, dst, dst, src);
+ break;
+ case BPF_XOR:
+ emit_instr(ctx, xor, dst, dst, src);
+ break;
+ case BPF_OR:
+ emit_instr(ctx, or, dst, dst, src);
+ break;
+ case BPF_AND:
+ emit_instr(ctx, and, dst, dst, src);
+ break;
+ case BPF_MUL:
+ emit_instr(ctx, mul, dst, dst, src);
+ break;
+ case BPF_DIV:
+ case BPF_MOD:
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
+ emit_instr(ctx, divu, dst, src);
+ if (BPF_OP(insn->code) == BPF_DIV)
+ emit_instr(ctx, mflo, dst);
+ else
+ emit_instr(ctx, mfhi, dst);
+ break;
+ case BPF_LSH:
+ emit_instr(ctx, sllv, dst, dst, src);
+ break;
+ case BPF_RSH:
+ emit_instr(ctx, srlv, dst, dst, src);
+ break;
+ default:
+ pr_err("ALU_REG NOT HANDLED\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_JMP | BPF_EXIT:
+ if (this_idx + 1 < exit_idx) {
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, nop);
+ }
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K: /* JMP_IMM */
+ case BPF_JMP | BPF_JNE | BPF_K: /* JMP_IMM */
+ cmp_eq = (BPF_OP(insn->code) == BPF_JEQ);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+ if (dst < 0)
+ return dst;
+ if (insn->imm == 0) {
+ src = MIPS_R_ZERO;
+ } else {
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ src = MIPS_R_AT;
+ }
+ goto jeq_common;
+ case BPF_JMP | BPF_JEQ | BPF_X: /* JMP_REG */
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (src < 0 || dst < 0)
+ return -EINVAL;
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+ if (td == REG_32BIT && ts != REG_32BIT) {
+ emit_instr(ctx, sll, MIPS_R_AT, src, 0);
+ src = MIPS_R_AT;
+ } else if (ts == REG_32BIT && td != REG_32BIT) {
+ emit_instr(ctx, sll, MIPS_R_AT, dst, 0);
+ dst = MIPS_R_AT;
+ }
+ if (BPF_OP(insn->code) == BPF_JSET) {
+ emit_instr(ctx, and, MIPS_R_AT, dst, src);
+ cmp_eq = false;
+ dst = MIPS_R_AT;
+ src = MIPS_R_ZERO;
+ } else if (BPF_OP(insn->code) == BPF_JSGT) {
+ emit_instr(ctx, dsubu, MIPS_R_AT, dst, src);
+ if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, blez, MIPS_R_AT, b_off);
+ emit_instr(ctx, nop);
+ return 2; /* We consumed the exit. */
+ }
+ b_off = b_imm(this_idx + insn->off + 1, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, bgtz, MIPS_R_AT, b_off);
+ emit_instr(ctx, nop);
+ break;
+ } else if (BPF_OP(insn->code) == BPF_JSGE) {
+ emit_instr(ctx, slt, MIPS_R_AT, dst, src);
+ cmp_eq = true;
+ dst = MIPS_R_AT;
+ src = MIPS_R_ZERO;
+ } else if (BPF_OP(insn->code) == BPF_JGT) {
+ /* dst or src could be AT */
+ emit_instr(ctx, dsubu, MIPS_R_T8, dst, src);
+ emit_instr(ctx, sltu, MIPS_R_AT, dst, src);
+ /* SP known to be non-zero, movz becomes boolean not */
+ emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8);
+ emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8);
+ emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT);
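+ /*
+ * AT is now non-zero iff dst <= src (unsigned), so the
+ * compare-against-zero below branches exactly when dst > src.
+ */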
+ cmp_eq = true;
+ dst = MIPS_R_AT;
+ src = MIPS_R_ZERO;
+ } else if (BPF_OP(insn->code) == BPF_JGE) {
+ emit_instr(ctx, sltu, MIPS_R_AT, dst, src);
+ cmp_eq = true;
+ dst = MIPS_R_AT;
+ src = MIPS_R_ZERO;
+ } else { /* JNE/JEQ case */
+ cmp_eq = (BPF_OP(insn->code) == BPF_JEQ);
+ }
+jeq_common:
+ /*
+ * If the next insn is EXIT and we are jumping around
+ * only it, invert the sense of the compare and
+ * conditionally jump to the exit. Poor man's branch
+ * chaining.
+ */
+ if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off)) {
+ target = j_target(ctx, exit_idx);
+ if (target == (unsigned int)-1)
+ return -E2BIG;
+ cmp_eq = !cmp_eq;
+ b_off = 4 * 3;
+ if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+ ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+ ctx->long_b_conversion = 1;
+ }
+ }
+
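+ /*
+ * Branch offsets here are relative to the delay slot, so in the
+ * converted case an offset of 3 words lands just past the j and
+ * its delay-slot nop below.
+ */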
+ if (cmp_eq)
+ emit_instr(ctx, bne, dst, src, b_off);
+ else
+ emit_instr(ctx, beq, dst, src, b_off);
+ emit_instr(ctx, nop);
+ if (ctx->offsets[this_idx] & OFFSETS_B_CONV) {
+ emit_instr(ctx, j, target);
+ emit_instr(ctx, nop);
+ }
+ return 2; /* We consumed the exit. */
+ }
+ b_off = b_imm(this_idx + insn->off + 1, ctx);
+ if (is_bad_offset(b_off)) {
+ target = j_target(ctx, this_idx + insn->off + 1);
+ if (target == (unsigned int)-1)
+ return -E2BIG;
+ cmp_eq = !cmp_eq;
+ b_off = 4 * 3;
+ if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+ ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+ ctx->long_b_conversion = 1;
+ }
+ }
+
+ if (cmp_eq)
+ emit_instr(ctx, beq, dst, src, b_off);
+ else
+ emit_instr(ctx, bne, dst, src, b_off);
+ emit_instr(ctx, nop);
+ if (ctx->offsets[this_idx] & OFFSETS_B_CONV) {
+ emit_instr(ctx, j, target);
+ emit_instr(ctx, nop);
+ }
+ break;
+ case BPF_JMP | BPF_JSGT | BPF_K: /* JMP_IMM */
+ case BPF_JMP | BPF_JSGE | BPF_K: /* JMP_IMM */
+ cmp_eq = (BPF_OP(insn->code) == BPF_JSGE);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+ if (dst < 0)
+ return dst;
+
+ if (insn->imm == 0) {
+ if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ if (cmp_eq)
+ emit_instr(ctx, bltz, dst, b_off);
+ else
+ emit_instr(ctx, blez, dst, b_off);
+ emit_instr(ctx, nop);
+ return 2; /* We consumed the exit. */
+ }
+ b_off = b_imm(this_idx + insn->off + 1, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ if (cmp_eq)
+ emit_instr(ctx, bgez, dst, b_off);
+ else
+ emit_instr(ctx, bgtz, dst, b_off);
+ emit_instr(ctx, nop);
+ break;
+ }
+ /*
+ * only "LT" compare available, so we must use imm + 1
+ * to generate "GT"
+ */
+ t64s = insn->imm + (cmp_eq ? 0 : 1);
+ if (t64s >= S16_MIN && t64s <= S16_MAX) {
+ emit_instr(ctx, slti, MIPS_R_AT, dst, (int)t64s);
+ src = MIPS_R_AT;
+ dst = MIPS_R_ZERO;
+ cmp_eq = true;
+ goto jeq_common;
+ }
+ emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s);
+ emit_instr(ctx, slt, MIPS_R_AT, dst, MIPS_R_AT);
+ src = MIPS_R_AT;
+ dst = MIPS_R_ZERO;
+ cmp_eq = true;
+ goto jeq_common;
+
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ cmp_eq = (BPF_OP(insn->code) == BPF_JGE);
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+ if (dst < 0)
+ return dst;
+ /*
+ * only "LT" compare available, so we must use imm + 1
+ * to generate "GT"
+ */
+ t64s = (u64)(u32)(insn->imm) + (cmp_eq ? 0 : 1);
+ if (t64s >= 0 && t64s <= S16_MAX) {
+ emit_instr(ctx, sltiu, MIPS_R_AT, dst, (int)t64s);
+ src = MIPS_R_AT;
+ dst = MIPS_R_ZERO;
+ cmp_eq = true;
+ goto jeq_common;
+ }
+ emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s);
+ emit_instr(ctx, sltu, MIPS_R_AT, dst, MIPS_R_AT);
+ src = MIPS_R_AT;
+ dst = MIPS_R_ZERO;
+ cmp_eq = true;
+ goto jeq_common;
+
+ case BPF_JMP | BPF_JSET | BPF_K: /* JMP_IMM */
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+ if (dst < 0)
+ return dst;
+
+ if (use_bbit_insns() && hweight32((u32)insn->imm) == 1) {
+ if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, bbit0, dst, ffs((u32)insn->imm) - 1, b_off);
+ emit_instr(ctx, nop);
+ return 2; /* We consumed the exit. */
+ }
+ b_off = b_imm(this_idx + insn->off + 1, ctx);
+ if (is_bad_offset(b_off))
+ return -E2BIG;
+ emit_instr(ctx, bbit1, dst, ffs((u32)insn->imm) - 1, b_off);
+ emit_instr(ctx, nop);
+ break;
+ }
+ t64 = (u32)insn->imm;
+ emit_const_to_reg(ctx, MIPS_R_AT, t64);
+ emit_instr(ctx, and, MIPS_R_AT, dst, MIPS_R_AT);
+ src = MIPS_R_AT;
+ dst = MIPS_R_ZERO;
+ cmp_eq = false;
+ goto jeq_common;
+
+ case BPF_JMP | BPF_JA:
+ /*
+ * Prefer relative branch for easier debugging, but
+ * fall back to a j instruction if needed.
+ */
+ b_off = b_imm(this_idx + insn->off + 1, ctx);
+ if (is_bad_offset(b_off)) {
+ target = j_target(ctx, this_idx + insn->off + 1);
+ if (target == (unsigned int)-1)
+ return -E2BIG;
+ emit_instr(ctx, j, target);
+ } else {
+ emit_instr(ctx, b, b_off);
+ }
+ emit_instr(ctx, nop);
+ break;
+ case BPF_LD | BPF_DW | BPF_IMM:
+ if (insn->src_reg != 0)
+ return -EINVAL;
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ t64 = ((u64)(u32)insn->imm) | ((u64)(insn + 1)->imm << 32);
+ emit_const_to_reg(ctx, dst, t64);
+ return 2; /* Double slot insn */
+
+ case BPF_JMP | BPF_CALL:
+ ctx->flags |= EBPF_SAVE_RA;
+ t64s = (s64)insn->imm + (s64)__bpf_call_base;
+ emit_const_to_reg(ctx, MIPS_R_T9, (u64)t64s);
+ emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
+ /* delay slot */
+ emit_instr(ctx, nop);
+ break;
+
+ case BPF_JMP | BPF_TAIL_CALL:
+ if (emit_bpf_tail_call(ctx, this_idx))
+ return -EINVAL;
+ break;
+
+ case BPF_LD | BPF_B | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_DW | BPF_ABS:
+ ctx->flags |= EBPF_SAVE_RA;
+
+ gen_imm_to_reg(insn, MIPS_R_A1, ctx);
+ emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
+
+ if (insn->imm < 0) {
+ emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper);
+ } else {
+ emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
+ emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
+ }
+ goto ld_skb_common;
+
+ case BPF_LD | BPF_B | BPF_IND:
+ case BPF_LD | BPF_H | BPF_IND:
+ case BPF_LD | BPF_W | BPF_IND:
+ case BPF_LD | BPF_DW | BPF_IND:
+ ctx->flags |= EBPF_SAVE_RA;
+ src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+ if (src < 0)
+ return src;
+ ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+ if (ts == REG_32BIT_ZERO_EX) {
+ /* sign extend */
+ emit_instr(ctx, sll, MIPS_R_A1, src, 0);
+ src = MIPS_R_A1;
+ }
+ if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
+ emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm);
+ } else {
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src);
+ }
+ /* truncate to 32-bit int */
+ emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0);
+ emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
+ emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO);
+
+ emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper);
+ emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
+ emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
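+ /* If the 32-bit offset in $a1 is negative, call the neg_helper instead. */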
+ emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT);
+
+ld_skb_common:
+ emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
+ /* delay slot move */
+ emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO);
+
+ /* Check the error value */
+ b_off = b_imm(exit_idx, ctx);
+ if (is_bad_offset(b_off)) {
+ target = j_target(ctx, exit_idx);
+ if (target == (unsigned int)-1)
+ return -E2BIG;
+
+ if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+ ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+ ctx->long_b_conversion = 1;
+ }
+ emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3);
+ emit_instr(ctx, nop);
+ emit_instr(ctx, j, target);
+ emit_instr(ctx, nop);
+ } else {
+ emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off);
+ emit_instr(ctx, nop);
+ }
+
+#ifdef __BIG_ENDIAN
+ need_swap = false;
+#else
+ need_swap = true;
+#endif
+ dst = MIPS_R_V0;
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_B:
+ emit_instr(ctx, lbu, dst, 0, MIPS_R_V0);
+ break;
+ case BPF_H:
+ emit_instr(ctx, lhu, dst, 0, MIPS_R_V0);
+ if (need_swap)
+ emit_instr(ctx, wsbh, dst, dst);
+ break;
+ case BPF_W:
+ emit_instr(ctx, lw, dst, 0, MIPS_R_V0);
+ if (need_swap) {
+ emit_instr(ctx, wsbh, dst, dst);
+ emit_instr(ctx, rotr, dst, dst, 16);
+ }
+ break;
+ case BPF_DW:
+ emit_instr(ctx, ld, dst, 0, MIPS_R_V0);
+ if (need_swap) {
+ emit_instr(ctx, dsbh, dst, dst);
+ emit_instr(ctx, dshd, dst, dst);
+ }
+ break;
+ }
+
+ break;
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+ if (insn->imm == 64 && td == REG_32BIT)
+ emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+
+ if (insn->imm != 64 &&
+ (td == REG_64BIT || td == REG_32BIT_ZERO_EX)) {
+ /* sign extend */
+ emit_instr(ctx, sll, dst, dst, 0);
+ }
+
+#ifdef __BIG_ENDIAN
+ need_swap = (BPF_SRC(insn->code) == BPF_FROM_LE);
+#else
+ need_swap = (BPF_SRC(insn->code) == BPF_FROM_BE);
+#endif
+ if (insn->imm == 16) {
+ if (need_swap)
+ emit_instr(ctx, wsbh, dst, dst);
+ emit_instr(ctx, andi, dst, dst, 0xffff);
+ } else if (insn->imm == 32) {
+ if (need_swap) {
+ emit_instr(ctx, wsbh, dst, dst);
+ emit_instr(ctx, rotr, dst, dst, 16);
+ }
+ } else { /* 64-bit */
+ if (need_swap) {
+ emit_instr(ctx, dsbh, dst, dst);
+ emit_instr(ctx, dshd, dst, dst);
+ }
+ }
+ break;
+
+ case BPF_ST | BPF_B | BPF_MEM:
+ case BPF_ST | BPF_H | BPF_MEM:
+ case BPF_ST | BPF_W | BPF_MEM:
+ case BPF_ST | BPF_DW | BPF_MEM:
+ if (insn->dst_reg == BPF_REG_10) {
+ ctx->flags |= EBPF_SEEN_FP;
+ dst = MIPS_R_SP;
+ mem_off = insn->off + MAX_BPF_STACK;
+ } else {
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ mem_off = insn->off;
+ }
+ gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_B:
+ emit_instr(ctx, sb, MIPS_R_AT, mem_off, dst);
+ break;
+ case BPF_H:
+ emit_instr(ctx, sh, MIPS_R_AT, mem_off, dst);
+ break;
+ case BPF_W:
+ emit_instr(ctx, sw, MIPS_R_AT, mem_off, dst);
+ break;
+ case BPF_DW:
+ emit_instr(ctx, sd, MIPS_R_AT, mem_off, dst);
+ break;
+ }
+ break;
+
+ case BPF_LDX | BPF_B | BPF_MEM:
+ case BPF_LDX | BPF_H | BPF_MEM:
+ case BPF_LDX | BPF_W | BPF_MEM:
+ case BPF_LDX | BPF_DW | BPF_MEM:
+ if (insn->src_reg == BPF_REG_10) {
+ ctx->flags |= EBPF_SEEN_FP;
+ src = MIPS_R_SP;
+ mem_off = insn->off + MAX_BPF_STACK;
+ } else {
+ src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+ if (src < 0)
+ return src;
+ mem_off = insn->off;
+ }
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_B:
+ emit_instr(ctx, lbu, dst, mem_off, src);
+ break;
+ case BPF_H:
+ emit_instr(ctx, lhu, dst, mem_off, src);
+ break;
+ case BPF_W:
+ emit_instr(ctx, lw, dst, mem_off, src);
+ break;
+ case BPF_DW:
+ emit_instr(ctx, ld, dst, mem_off, src);
+ break;
+ }
+ break;
+
+ case BPF_STX | BPF_B | BPF_MEM:
+ case BPF_STX | BPF_H | BPF_MEM:
+ case BPF_STX | BPF_W | BPF_MEM:
+ case BPF_STX | BPF_DW | BPF_MEM:
+ case BPF_STX | BPF_W | BPF_XADD:
+ case BPF_STX | BPF_DW | BPF_XADD:
+ if (insn->dst_reg == BPF_REG_10) {
+ ctx->flags |= EBPF_SEEN_FP;
+ dst = MIPS_R_SP;
+ mem_off = insn->off + MAX_BPF_STACK;
+ } else {
+ dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+ if (dst < 0)
+ return dst;
+ mem_off = insn->off;
+ }
+ src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+ if (src < 0)
+ return src;
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_W:
+ if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+ emit_instr(ctx, sll, MIPS_R_AT, src, 0);
+ src = MIPS_R_AT;
+ }
+ emit_instr(ctx, ll, MIPS_R_T8, mem_off, dst);
+ emit_instr(ctx, addu, MIPS_R_T8, MIPS_R_T8, src);
+ emit_instr(ctx, sc, MIPS_R_T8, mem_off, dst);
+ /*
+ * On failure back up to LL (-4
+ * instructions of 4 bytes each).
+ */
+ emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4);
+ emit_instr(ctx, nop);
+ break;
+ case BPF_DW:
+ if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+ emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO);
+ emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32);
+ src = MIPS_R_AT;
+ }
+ emit_instr(ctx, lld, MIPS_R_T8, mem_off, dst);
+ emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, src);
+ emit_instr(ctx, scd, MIPS_R_T8, mem_off, dst);
+ emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4);
+ emit_instr(ctx, nop);
+ break;
+ }
+ } else { /* BPF_MEM */
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_B:
+ emit_instr(ctx, sb, src, mem_off, dst);
+ break;
+ case BPF_H:
+ emit_instr(ctx, sh, src, mem_off, dst);
+ break;
+ case BPF_W:
+ emit_instr(ctx, sw, src, mem_off, dst);
+ break;
+ case BPF_DW:
+ if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+ emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO);
+ emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32);
+ src = MIPS_R_AT;
+ }
+ emit_instr(ctx, sd, src, mem_off, dst);
+ break;
+ }
+ }
+ break;
+
+ default:
+ pr_err("NOT HANDLED %d - (%02x)\n",
+ this_idx, (unsigned int)insn->code);
+ return -EINVAL;
+ }
+ return 1;
+}
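
For orientation, the BPF_LD | BPF_IND handling above (and the BPF_ABS cases that share the ld_skb_common path) emits a call to either bpf_internal_load_pointer_neg_helper() for negative offsets or this file's ool_skb_header_pointer() wrapper, branches to the epilogue when the helper returns NULL, then loads and byte-swaps the result. A hedged C equivalent of that emitted sequence; the local names and the exact ool_skb_header_pointer() signature are assumptions, not code from this patch:

#include <linux/skbuff.h>
#include <linux/filter.h>

/*
 * Sketch of what the emitted LD_ABS/LD_IND sequence computes.
 * ool_skb_header_pointer() is assumed to wrap skb_header_pointer();
 * the real helper lives earlier in ebpf_jit.c.
 */
static u64 ld_abs_ind_equiv(const struct sk_buff *skb, int offset,
			    unsigned int size, void *tmp_buf)
{
	void *p;

	if (offset < 0)
		p = bpf_internal_load_pointer_neg_helper(skb, offset, size);
	else
		p = ool_skb_header_pointer(skb, offset, size, tmp_buf);

	if (!p)
		return 0;	/* the JIT branches to the epilogue: program returns 0 */

	switch (size) {
	case 1:
		return *(u8 *)p;
	case 2:
		return be16_to_cpu(*(__be16 *)p);	/* wsbh on little-endian */
	case 4:
		return be32_to_cpu(*(__be32 *)p);	/* wsbh + rotr */
	default:
		return be64_to_cpu(*(__be64 *)p);	/* dsbh + dshd */
	}
}
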
+
+#define RVT_VISITED_MASK 0xc000000000000000ull
+#define RVT_FALL_THROUGH 0x4000000000000000ull
+#define RVT_BRANCH_TAKEN 0x8000000000000000ull
+#define RVT_DONE (RVT_FALL_THROUGH | RVT_BRANCH_TAKEN)
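
These visited flags occupy the top of each reg_val_types[] word; the low bits hold a 3-bit value-range code per eBPF register, as described in reg_val_propagate() below. The accessors used throughout, set_reg_val_type() and get_reg_val_type(), are defined earlier in this file and are presumably along these lines (a sketch of the assumed packing, not the patch's exact code):

/* Assumed layout: 3 bits per eBPF register in a u64, with bits 62..63
 * reserved for the RVT_* visit flags above. */
static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type)
{
	*rvt &= ~(7ull << (reg * 3));
	*rvt |= ((u64)type << (reg * 3));
}

static enum reg_val_type get_reg_val_type(const struct jit_ctx *ctx,
					  int index, int reg)
{
	return (ctx->reg_val_types[index] >> (reg * 3)) & 7;
}
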
+
+static int build_int_body(struct jit_ctx *ctx)
+{
+ const struct bpf_prog *prog = ctx->skf;
+ const struct bpf_insn *insn;
+ int i, r;
+
+ for (i = 0; i < prog->len; ) {
+ insn = prog->insnsi + i;
+ if ((ctx->reg_val_types[i] & RVT_VISITED_MASK) == 0) {
+ /* dead instruction, don't emit it. */
+ i++;
+ continue;
+ }
+
+ if (ctx->target == NULL)
+ ctx->offsets[i] = (ctx->offsets[i] & OFFSETS_B_CONV) | (ctx->idx * 4);
+
+ r = build_one_insn(insn, ctx, i, prog->len);
+ if (r < 0)
+ return r;
+ i += r;
+ }
+ /* epilogue offset */
+ if (ctx->target == NULL)
+ ctx->offsets[i] = ctx->idx * 4;
+
+ /*
+ * All exits have an offset of the epilogue, some offsets may
+ * not have been set due to branch-around threading, so set
+ * them now.
+ */
+ if (ctx->target == NULL)
+ for (i = 0; i < prog->len; i++) {
+ insn = prog->insnsi + i;
+ if (insn->code == (BPF_JMP | BPF_EXIT))
+ ctx->offsets[i] = ctx->idx * 4;
+ }
+ return 0;
+}
+
+/* return the last idx processed, or negative for error */
+static int reg_val_propagate_range(struct jit_ctx *ctx, u64 initial_rvt,
+ int start_idx, bool follow_taken)
+{
+ const struct bpf_prog *prog = ctx->skf;
+ const struct bpf_insn *insn;
+ u64 exit_rvt = initial_rvt;
+ u64 *rvt = ctx->reg_val_types;
+ int idx;
+ int reg;
+
+ for (idx = start_idx; idx < prog->len; idx++) {
+ rvt[idx] = (rvt[idx] & RVT_VISITED_MASK) | exit_rvt;
+ insn = prog->insnsi + idx;
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_ALU:
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_DIV:
+ case BPF_OR:
+ case BPF_AND:
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_NEG:
+ case BPF_MOD:
+ case BPF_XOR:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ break;
+ case BPF_MOV:
+ if (BPF_SRC(insn->code)) {
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ } else {
+ /* IMM to REG move */
+ if (insn->imm >= 0)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ else
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ }
+ break;
+ case BPF_END:
+ if (insn->imm == 64)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ else if (insn->imm == 32)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ else /* insn->imm == 16 */
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ break;
+ }
+ rvt[idx] |= RVT_DONE;
+ break;
+ case BPF_ALU64:
+ switch (BPF_OP(insn->code)) {
+ case BPF_MOV:
+ if (BPF_SRC(insn->code)) {
+ /* REG to REG move */
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ } else {
+ /* IMM to REG move */
+ if (insn->imm >= 0)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ else
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT);
+ }
+ break;
+ default:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ }
+ rvt[idx] |= RVT_DONE;
+ break;
+ case BPF_LD:
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_DW:
+ if (BPF_MODE(insn->code) == BPF_IMM) {
+ s64 val;
+
+ val = (s64)((u32)insn->imm | ((u64)(insn + 1)->imm << 32));
+ if (val > 0 && val <= S32_MAX)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ else if (val >= S32_MIN && val <= S32_MAX)
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT);
+ else
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ rvt[idx] |= RVT_DONE;
+ idx++;
+ } else {
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ }
+ break;
+ case BPF_B:
+ case BPF_H:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ break;
+ case BPF_W:
+ if (BPF_MODE(insn->code) == BPF_IMM)
+ set_reg_val_type(&exit_rvt, insn->dst_reg,
+ insn->imm >= 0 ? REG_32BIT_POS : REG_32BIT);
+ else
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ break;
+ }
+ rvt[idx] |= RVT_DONE;
+ break;
+ case BPF_LDX:
+ switch (BPF_SIZE(insn->code)) {
+ case BPF_DW:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+ break;
+ case BPF_B:
+ case BPF_H:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+ break;
+ case BPF_W:
+ set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+ break;
+ }
+ rvt[idx] |= RVT_DONE;
+ break;
+ case BPF_JMP:
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ rvt[idx] = RVT_DONE | exit_rvt;
+ rvt[prog->len] = exit_rvt;
+ return idx;
+ case BPF_JA:
+ rvt[idx] |= RVT_DONE;
+ idx += insn->off;
+ break;
+ case BPF_JEQ:
+ case BPF_JGT:
+ case BPF_JGE:
+ case BPF_JSET:
+ case BPF_JNE:
+ case BPF_JSGT:
+ case BPF_JSGE:
+ if (follow_taken) {
+ rvt[idx] |= RVT_BRANCH_TAKEN;
+ idx += insn->off;
+ follow_taken = false;
+ } else {
+ rvt[idx] |= RVT_FALL_THROUGH;
+ }
+ break;
+ case BPF_CALL:
+ set_reg_val_type(&exit_rvt, BPF_REG_0, REG_64BIT);
+ /* Upon call return, argument registers are clobbered. */
+ for (reg = BPF_REG_0; reg <= BPF_REG_5; reg++)
+ set_reg_val_type(&exit_rvt, reg, REG_64BIT);
+
+ rvt[idx] |= RVT_DONE;
+ break;
+ default:
+ WARN(1, "Unhandled BPF_JMP case.\n");
+ rvt[idx] |= RVT_DONE;
+ break;
+ }
+ break;
+ default:
+ rvt[idx] |= RVT_DONE;
+ break;
+ }
+ }
+ return idx;
+}
+
+/*
+ * Track the value range (i.e. 32-bit vs. 64-bit) of each register at
+ * each eBPF insn. This allows unneeded sign and zero extension
+ * operations to be omitted.
+ *
+ * Doesn't yet handle confluence of control paths with conflicting
+ * ranges, but it is good enough for most sane code.
+ */
+static int reg_val_propagate(struct jit_ctx *ctx)
+{
+ const struct bpf_prog *prog = ctx->skf;
+ u64 exit_rvt;
+ int reg;
+ int i;
+
+ /*
+ * 11 registers * 3 bits/reg leaves the top bits free for other
+ * uses; bits 62..63 record whether we have visited an insn.
+ */
+ exit_rvt = 0;
+
+ /* Upon entry, argument registers are 64-bit. */
+ for (reg = BPF_REG_1; reg <= BPF_REG_5; reg++)
+ set_reg_val_type(&exit_rvt, reg, REG_64BIT);
+
+ /*
+ * First follow all conditional branches on the fall-through
+ * edge of control flow.
+ */
+ reg_val_propagate_range(ctx, exit_rvt, 0, false);
+restart_search:
+ /*
+ * Then repeatedly find the first conditional branch where
+ * both edges of control flow have not been taken, and follow
+ * the branch taken edge. We will end up restarting the
+ * search once per conditional branch insn.
+ */
+ for (i = 0; i < prog->len; i++) {
+ u64 rvt = ctx->reg_val_types[i];
+
+ if ((rvt & RVT_VISITED_MASK) == RVT_DONE ||
+ (rvt & RVT_VISITED_MASK) == 0)
+ continue;
+ if ((rvt & RVT_VISITED_MASK) == RVT_FALL_THROUGH) {
+ reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, true);
+ } else { /* RVT_BRANCH_TAKEN */
+ WARN(1, "Unexpected RVT_BRANCH_TAKEN case.\n");
+ reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, false);
+ }
+ goto restart_search;
+ }
+ /*
+ * Eventually all conditional branches have been followed on
+ * both branches and we are done. Any insn that has not been
+ * visited at this point is dead.
+ */
+
+ return 0;
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+ u32 *p;
+
+ /* We are guaranteed to have aligned memory. */
+ for (p = area; size >= sizeof(u32); size -= sizeof(u32))
+ uasm_i_break(&p, BRK_BUG); /* Increments p */
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ struct bpf_prog *orig_prog = prog;
+ bool tmp_blinded = false;
+ struct bpf_prog *tmp;
+ struct bpf_binary_header *header = NULL;
+ struct jit_ctx ctx;
+ unsigned int image_size;
+ u8 *image_ptr;
+
+ if (!bpf_jit_enable || !cpu_has_mips64r2)
+ return prog;
+
+ tmp = bpf_jit_blind_constants(prog);
+ /* If blinding was requested and we failed during blinding,
+ * we must fall back to the interpreter.
+ */
+ if (IS_ERR(tmp))
+ return orig_prog;
+ if (tmp != prog) {
+ tmp_blinded = true;
+ prog = tmp;
+ }
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ ctx.offsets = kcalloc(prog->len + 1, sizeof(*ctx.offsets), GFP_KERNEL);
+ if (ctx.offsets == NULL)
+ goto out_err;
+
+ ctx.reg_val_types = kcalloc(prog->len + 1, sizeof(*ctx.reg_val_types), GFP_KERNEL);
+ if (ctx.reg_val_types == NULL)
+ goto out_err;
+
+ ctx.skf = prog;
+
+ if (reg_val_propagate(&ctx))
+ goto out_err;
+
+ /*
+ * First pass discovers used resources and instruction offsets
+ * assuming short branches are used.
+ */
+ if (build_int_body(&ctx))
+ goto out_err;
+
+ /*
+ * If no calls are made (EBPF_SAVE_RA is not set), the tail call
+ * count can stay in $v1, else we must save it in $s4.
+ */
+ if (ctx.flags & EBPF_SEEN_TC) {
+ if (ctx.flags & EBPF_SAVE_RA)
+ ctx.flags |= EBPF_SAVE_S4;
+ else
+ ctx.flags |= EBPF_TCC_IN_V1;
+ }
+
+ /*
+ * Second pass generates offsets. If any branches are out of
+ * range, a jump-around long sequence is generated, and we have
+ * to try again from the beginning to generate the new
+ * offsets. This is done until no additional conversions are
+ * necessary.
+ */
+ do {
+ ctx.idx = 0;
+ ctx.gen_b_offsets = 1;
+ ctx.long_b_conversion = 0;
+ if (gen_int_prologue(&ctx))
+ goto out_err;
+ if (build_int_body(&ctx))
+ goto out_err;
+ if (build_int_epilogue(&ctx, MIPS_R_RA))
+ goto out_err;
+ } while (ctx.long_b_conversion);
+
+ image_size = 4 * ctx.idx;
+
+ header = bpf_jit_binary_alloc(image_size, &image_ptr,
+ sizeof(u32), jit_fill_hole);
+ if (header == NULL)
+ goto out_err;
+
+ ctx.target = (u32 *)image_ptr;
+
+ /* Third pass generates the code */
+ ctx.idx = 0;
+ if (gen_int_prologue(&ctx))
+ goto out_err;
+ if (build_int_body(&ctx))
+ goto out_err;
+ if (build_int_epilogue(&ctx, MIPS_R_RA))
+ goto out_err;
+
+ /* Update the icache */
+ flush_icache_range((unsigned long)ctx.target,
+ (unsigned long)(ctx.target + ctx.idx * sizeof(u32)));
+
+ if (bpf_jit_enable > 1)
+ /* Dump JIT code */
+ bpf_jit_dump(prog->len, image_size, 2, ctx.target);
+
+ bpf_jit_binary_lock_ro(header);
+ prog->bpf_func = (void *)ctx.target;
+ prog->jited = 1;
+ prog->jited_len = image_size;
+out_normal:
+ if (tmp_blinded)
+ bpf_jit_prog_release_other(prog, prog == orig_prog ?
+ tmp : orig_prog);
+ kfree(ctx.offsets);
+ kfree(ctx.reg_val_types);
+
+ return prog;
+
+out_err:
+ prog = orig_prog;
+ if (header)
+ bpf_jit_binary_free(header);
+ goto out_normal;
+}
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index bd67ac74fe2d..9632436d74d7 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -28,16 +28,15 @@ EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
static int __init pcibios_set_cache_line_size(void)
{
- struct cpuinfo_mips *c = &current_cpu_data;
unsigned int lsize;
/*
* Set PCI cacheline size to that of the highest level in the
* cache hierarchy.
*/
- lsize = c->dcache.linesz;
- lsize = c->scache.linesz ? : lsize;
- lsize = c->tcache.linesz ? : lsize;
+ lsize = cpu_dcache_line_size();
+ lsize = cpu_scache_line_size() ? : lsize;
+ lsize = cpu_tcache_line_size() ? : lsize;
BUG_ON(!lsize);
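
The `?:` used here is GCC's conditional with an omitted middle operand: `a ? : b` yields `a` when it is non-zero and `b` otherwise, evaluating `a` only once, so each higher cache level overrides the line size only when it actually exists. A standalone illustration (generic C, not part of this patch):

/* Pick the line size of the highest cache level that is present;
 * the ?: form keeps the previous value when a level reports 0. */
static unsigned int pick_line_size(unsigned int dcache_linesz,
				   unsigned int scache_linesz,
				   unsigned int tcache_linesz)
{
	unsigned int lsize = dcache_linesz;

	lsize = scache_linesz ? : lsize;
	lsize = tcache_linesz ? : lsize;

	return lsize;
}
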
diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c
index 094a0ee4af46..9be8b08ae46b 100644
--- a/arch/mips/ralink/mt7620.c
+++ b/arch/mips/ralink/mt7620.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/bug.h>
#include <asm/mipsregs.h>
#include <asm/mach-ralink/ralink_regs.h>
diff --git a/arch/mips/vdso/gettimeofday.c b/arch/mips/vdso/gettimeofday.c
index 974276e828b2..e2690d7ca4dd 100644
--- a/arch/mips/vdso/gettimeofday.c
+++ b/arch/mips/vdso/gettimeofday.c
@@ -35,7 +35,8 @@ static __always_inline long gettimeofday_fallback(struct timeval *_tv,
" syscall\n"
: "=r" (ret), "=r" (error)
: "r" (tv), "r" (tz), "r" (nr)
- : "memory");
+ : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "$15", "$24", "$25", "hi", "lo", "memory");
return error ? -ret : ret;
}
@@ -55,7 +56,8 @@ static __always_inline long clock_gettime_fallback(clockid_t _clkid,
" syscall\n"
: "=r" (ret), "=r" (error)
: "r" (clkid), "r" (ts), "r" (nr)
- : "memory");
+ : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "$15", "$24", "$25", "hi", "lo", "memory");
return error ? -ret : ret;
}
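
The added clobbers reflect that a MIPS syscall may trash every caller-saved GPR plus hi/lo, not only memory; without them the compiler is free to keep live values in those registers across the `syscall`. A hedged, simplified sketch of the pattern (a generic two-argument fallback; the register bindings and names are illustrative, not the vDSO's exact code):

/* Two-argument syscall fallback with the full clobber list; $2/$4/$5/$7
 * are v0/a0/a1/a3 in ABI terms. Illustrative only. */
static inline long syscall2(long nr_in, long arg0, long arg1)
{
	register long nr  asm("$2") = nr_in;	/* v0: number in, result out */
	register long a0  asm("$4") = arg0;
	register long a1  asm("$5") = arg1;
	register long err asm("$7");		/* a3: non-zero on error */

	asm volatile(
	"	syscall\n"
	: "+r" (nr), "=r" (err)
	: "r" (a0), "r" (a1)
	: "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13",
	  "$14", "$15", "$24", "$25", "hi", "lo", "memory");

	return err ? -nr : nr;
}
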
diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h
index 9c7b8f7942d8..fe413b41df6c 100644
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -26,11 +26,6 @@
#define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
asm volatile(
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
index 778087341977..8fed278a24b8 100644
--- a/arch/openrisc/include/asm/futex.h
+++ b/arch/openrisc/include/asm/futex.h
@@ -30,20 +30,10 @@
})
static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
@@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
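
This is part of the tree-wide futex cleanup: each architecture now only performs the atomic operation and returns the old value through *oval, while the encoded-op decoding and the final comparison move into a single generic wrapper. That wrapper (in kernel/futex.c after this series) looks roughly like the sketch below, reconstructed from the code being removed here; details such as extra argument validation are approximate:

static int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
		return -EFAULT;

	/* Arch hook: do the op atomically, report the old value. */
	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval <  cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval >  cmparg;
	default:              return -ENOSYS;
	}
}
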
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 531da9eb8f43..dda1f558ef35 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -47,6 +47,9 @@ config PARISC
and later HP3000 series). The PA-RISC Linux project home page is
at <http://www.parisc-linux.org/>.
+config CPU_BIG_ENDIAN
+ def_bool y
+
config MMU
def_bool y
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 5394b9c5f914..17b98a87e5e2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i)
_atomic_spin_unlock_irqrestore(v, flags);
}
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
static __inline__ int atomic_read(const atomic_t *v)
{
return READ_ONCE((v)->counter);
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0ba14300cd8e..c601aab2fb36 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
}
static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
{
unsigned long int flags;
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval, ret;
u32 tmp;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
- return -EFAULT;
-
_futex_spin_lock_irqsave(uaddr, &flags);
pagefault_disable();
@@ -85,17 +75,9 @@ out_pagefault_enable:
pagefault_enable();
_futex_spin_unlock_irqrestore(uaddr, &flags);
- if (ret == 0) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index e32936cd7f10..55bfe4affca3 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x)
#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *x)
-{
- volatile unsigned int *a = __ldcw_align(x);
-
- smp_cond_load_acquire(a, VAL);
-}
-
static inline void arch_spin_lock_flags(arch_spinlock_t *x,
unsigned long flags)
{
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index 88fe0aad4390..bc208136bbb2 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -34,7 +34,7 @@ struct thread_info {
/* thread information allocation */
-#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */
+#define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */
/* Be sure to hunt all references to this down when you change the size of
* the kernel stack */
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 85a92db70afc..19c0c141bc3f 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -587,13 +587,12 @@ void flush_cache_range(struct vm_area_struct *vma,
if (parisc_requires_coherency())
flush_tlb_range(vma, start, end);
- if ((end - start) >= parisc_cache_flush_threshold) {
+ if ((end - start) >= parisc_cache_flush_threshold
+ || vma->vm_mm->context != mfsp(3)) {
flush_cache_all();
return;
}
- BUG_ON(vma->vm_mm->context != mfsp(3));
-
flush_user_dcache_range_asm(start, end);
if (vma->vm_flags & VM_EXEC)
flush_user_icache_range_asm(start, end);
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 5404e4086cb9..0ca254085a66 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -380,7 +380,7 @@ static inline int eirr_to_irq(unsigned long eirr)
/*
* IRQ STACK - used for irq handler
*/
-#define IRQ_STACK_SIZE (4096 << 2) /* 16k irq stack size */
+#define IRQ_STACK_SIZE (4096 << 3) /* 32k irq stack size */
union irq_stack_union {
unsigned long stack[IRQ_STACK_SIZE/sizeof(unsigned long)];
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..81b0031f909f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -199,7 +199,7 @@ config PPC
select HAVE_OPTPROBES if PPC64
select HAVE_PERF_EVENTS
select HAVE_PERF_EVENTS_NMI if PPC64
- select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+ select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE if SMP
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index a7814a7b1523..6f952fe1f084 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -25,12 +25,20 @@ compress-$(CONFIG_KERNEL_XZ) := CONFIG_KERNEL_XZ
BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -Os -msoft-float -pipe \
-fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
- -isystem $(shell $(CROSS32CC) -print-file-name=include) \
-D$(compress-y)
+BOOTCC := $(CC)
ifdef CONFIG_PPC64_BOOT_WRAPPER
BOOTCFLAGS += -m64
+else
+BOOTCFLAGS += -m32
+ifdef CROSS32_COMPILE
+ BOOTCC := $(CROSS32_COMPILE)gcc
+endif
endif
+
+BOOTCFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include)
+
ifdef CONFIG_CPU_BIG_ENDIAN
BOOTCFLAGS += -mbig-endian
else
@@ -183,10 +191,10 @@ clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
quiet_cmd_bootcc = BOOTCC $@
- cmd_bootcc = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
+ cmd_bootcc = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
quiet_cmd_bootas = BOOTAS $@
- cmd_bootas = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
+ cmd_bootas = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
quiet_cmd_bootar = BOOTAR $@
cmd_bootar = $(CROSS32AR) -cr$(KBUILD_ARFLAGS) $@.$$$$ $(filter-out FORCE,$^); mv $@.$$$$ $@
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 0695ce047d56..34fc9bbfca9e 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -293,7 +293,8 @@ CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_LATENCYTOP=y
CONFIG_SCHED_TRACER=y
CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 5175028c56ce..c5246d29f385 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -324,7 +324,8 @@ CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_LATENCYTOP=y
CONFIG_SCHED_TRACER=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 1a61aa20dfba..fd5d98a0b95c 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -291,7 +291,8 @@ CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_LATENCYTOP=y
CONFIG_SCHED_TRACER=y
CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 25d42bd3f114..9c601adfc500 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -74,13 +74,6 @@ do { \
___p1; \
})
-/*
- * This must resolve to hwsync on SMP for the context switch path.
- * See _switch, and core scheduler context switch memory ordering
- * comments.
- */
-#define smp_mb__before_spinlock() smp_mb()
-
#include <asm-generic/barrier.h>
#endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415e283c..818a58fc3f4f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -608,9 +608,17 @@ static inline pte_t pte_mkdevmap(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
}
+/*
+ * This is potentially called with a pmd as the argument, in which case it's not
+ * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
+ * That's because the bit we use for _PAGE_DEVMAP is not reserved for software
+ * use in page directory entries (ie. non-ptes).
+ */
static inline int pte_devmap(pte_t pte)
{
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+ u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+
+ return (pte_raw(pte) & mask) == mask;
}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index eaada6c92344..719ed9b61ea7 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -29,18 +29,10 @@
: "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
: "cr0", "memory")
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
pagefault_disable();
@@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8b3f1238d07f..e372ed871c51 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -67,11 +67,6 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
-}
-
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 0c76675394c5..35bec1c5bd5a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -90,6 +90,24 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
/* Mark this context has been used on the new CPU */
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+
+ /*
+ * This full barrier orders the store to the cpumask above vs
+ * a subsequent operation which allows this CPU to begin loading
+ * translations for next.
+ *
+ * When using the radix MMU that operation is the load of the
+ * MMU context id, which is then moved to SPRN_PID.
+ *
+ * For the hash MMU it is either the first load from slb_cache
+ * in switch_slb(), and/or the store of paca->mm_ctx_id in
+ * copy_mm_to_paca().
+ *
+ * On the read side the barrier is in pte_xchg(), which orders
+ * the store to the PTE vs the load of mm_cpumask.
+ */
+ smp_mb();
+
new_on_cpu = true;
}
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 9c0f5db5cf46..67e7e3d990f4 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -87,6 +87,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
unsigned long *p = (unsigned long *)ptep;
__be64 prev;
+ /* See comment in switch_mm_irqs_off() */
prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old),
(__force unsigned long)pte_raw(new));
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 8bd3b13fe2fb..369a164b545c 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -62,6 +62,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
{
unsigned long *p = (unsigned long *)ptep;
+ /* See comment in switch_mm_irqs_off() */
return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new));
}
#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 8c1b913de6d7..edbe571bcc54 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
lock->slock = 0;
}
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- arch_spinlock_t lock_val;
-
- smp_mb();
-
- /*
- * Atomically load and store back the lock value (unchanged). This
- * ensures that our observation of the lock value is ordered with
- * respect to other lock operations.
- */
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0, 0, %2, 0) "\n"
-" stwcx. %0, 0, %2\n"
-" bne- 1b\n"
- : "=&r" (lock_val), "+m" (*lock)
- : "r" (lock)
- : "cr0", "xer");
-
- if (arch_spin_value_unlocked(lock_val))
- goto out;
-
- while (lock->slock) {
- HMT_low();
- if (SHARED_PROCESSOR)
- __spin_yield(lock);
- }
- HMT_medium();
-
-out:
- smp_mb();
-}
-
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
@@ -342,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
#define arch_read_relax(lock) __rw_yield(lock)
#define arch_write_relax(lock) __rw_yield(lock)
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock() smp_mb()
+
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
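
smp_mb__after_spinlock() lets a caller upgrade the ACQUIRE semantics of a just-taken spinlock to a full barrier only on the paths that need it, the scheduler's context-switch code being the motivating user now that smp_mb__before_spinlock() is gone. A hedged usage sketch; the structure and field names are invented for illustration:

#include <linux/spinlock.h>

/* Illustrative only: 'struct wake_q_demo' and its field are made up. */
struct wake_q_demo {
	raw_spinlock_t lock;
	int need_wakeup;
};

static int read_flag_ordered(struct wake_q_demo *q)
{
	int need;

	raw_spin_lock(&q->lock);
	/* Promote the lock's ACQUIRE to a full barrier so this CPU's
	 * earlier stores are ordered before the load below. */
	smp_mb__after_spinlock();
	need = READ_ONCE(q->need_wakeup);
	raw_spin_unlock(&q->lock);

	return need;
}
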
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 49d8422767b4..e925c1c99c71 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -223,17 +223,27 @@ system_call_exit:
andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne- .Lsyscall_exit_work
- /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */
- li r7,MSR_FP
+ andi. r0,r8,MSR_FP
+ beq 2f
#ifdef CONFIG_ALTIVEC
- oris r7,r7,MSR_VEC@h
+ andis. r0,r8,MSR_VEC@h
+ bne 3f
#endif
- and r0,r8,r7
- cmpd r0,r7
- bne .Lsyscall_restore_math
-.Lsyscall_restore_math_cont:
+2: addi r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+ li r10,MSR_RI
+ mtmsrd r10,1 /* Restore RI */
+#endif
+ bl restore_math
+#ifdef CONFIG_PPC_BOOK3S
+ li r11,0
+ mtmsrd r11,1
+#endif
+ ld r8,_MSR(r1)
+ ld r3,RESULT(r1)
+ li r11,-MAX_ERRNO
- cmpld r3,r11
+3: cmpld r3,r11
ld r5,_CCR(r1)
bge- .Lsyscall_error
.Lsyscall_error_cont:
@@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
std r5,_CCR(r1)
b .Lsyscall_error_cont
-.Lsyscall_restore_math:
- /*
- * Some initial tests from restore_math to avoid the heavyweight
- * C code entry and MSR manipulations.
- */
- LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK)
- and. r0,r0,r8
- bne 1f
-
- ld r7,PACACURRENT(r13)
- lbz r0,THREAD+THREAD_LOAD_FP(r7)
-#ifdef CONFIG_ALTIVEC
- lbz r6,THREAD+THREAD_LOAD_VEC(r7)
- add r0,r0,r6
-#endif
- cmpdi r0,0
- beq .Lsyscall_restore_math_cont
-
-1: addi r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
- li r10,MSR_RI
- mtmsrd r10,1 /* Restore RI */
-#endif
- bl restore_math
-#ifdef CONFIG_PPC_BOOK3S
- li r11,0
- mtmsrd r11,1
-#endif
- /* Restore volatiles, reload MSR from updated one */
- ld r8,_MSR(r1)
- ld r3,RESULT(r1)
- li r11,-MAX_ERRNO
- b .Lsyscall_restore_math_cont
-
/* Traced system call support */
.Lsyscall_dotrace:
bl save_nvgprs
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9029afd1fa2a..f14f3c04ec7e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1325,10 +1325,18 @@ EXC_VIRT_NONE(0x5800, 0x100)
std r10,PACA_EXGEN+EX_R13(r13); \
EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H)
+/*
+ * Branch to soft_nmi_interrupt using the emergency stack. The emergency
+ * stack is one that is usable by maskable interrupts so long as MSR_EE
+ * remains off. It is used for recovery when something has corrupted the
+ * normal kernel stack, for example. The "soft NMI" must not use the process
+ * stack because we want irq disabled sections to avoid touching the stack
+ * at all (other than PMU interrupts), so use the emergency stack for this,
+ * and run it entirely with interrupts hard disabled.
+ */
EXC_COMMON_BEGIN(soft_nmi_common)
mr r10,r1
ld r1,PACAEMERGSP(r13)
- ld r1,PACA_NMI_EMERG_SP(r13)
subi r1,r1,INT_FRAME_SIZE
EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900,
system_reset, soft_nmi_interrupt,
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 516ebef905c0..e6252c5a57a4 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -460,11 +460,17 @@ pnv_restore_hyp_resource_arch300:
/*
* Workaround for POWER9, if we lost resources, the ERAT
* might have been mixed up and needs flushing. We also need
- * to reload MMCR0 (see comment above).
+ * to reload MMCR0 (see comment above). We also need to set
+ * then clear bit 60 in MMCRA to ensure the PMU starts running.
*/
blt cr3,1f
PPC_INVALIDATE_ERAT
ld r1,PACAR1(r13)
+ mfspr r4,SPRN_MMCRA
+ ori r4,r4,(1 << (63-60))
+ mtspr SPRN_MMCRA,r4
+ xori r4,r4,(1 << (63-60))
+ mtspr SPRN_MMCRA,r4
ld r4,_MMCR0(r1)
mtspr SPRN_MMCR0,r4
1:
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0bcec745a672..f291f7826abc 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -145,6 +145,19 @@ notrace unsigned int __check_irq_replay(void)
/* Clear bit 0 which we wouldn't clear otherwise */
local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+ if (happened & PACA_IRQ_HARD_DIS) {
+ /*
+ * We may have missed a decrementer interrupt if hard disabled.
+ * Check the decrementer register in case we had a rollover
+ * while hard disabled.
+ */
+ if (!(happened & PACA_IRQ_DEC)) {
+ if (decrementer_check_overflow()) {
+ local_paca->irq_happened |= PACA_IRQ_DEC;
+ happened |= PACA_IRQ_DEC;
+ }
+ }
+ }
/*
* Force the delivery of pending soft-disabled interrupts on PS3.
@@ -170,7 +183,7 @@ notrace unsigned int __check_irq_replay(void)
* in case we also had a rollover while hard disabled
*/
local_paca->irq_happened &= ~PACA_IRQ_DEC;
- if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+ if (happened & PACA_IRQ_DEC)
return 0x900;
/* Finally check if an external interrupt happened */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f3e2c932dcc..1f0fd361e09b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -362,7 +362,8 @@ void enable_kernel_vsx(void)
cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
- if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+ if (current->thread.regs &&
+ (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) {
check_if_tm_restore_required(current);
/*
* If a thread has already been reclaimed then the
@@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
{
if (tsk->thread.regs) {
preempt_disable();
- if (tsk->thread.regs->msr & MSR_VSX) {
+ if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) {
BUG_ON(tsk != current);
giveup_vsx(tsk);
}
@@ -511,10 +512,6 @@ void restore_math(struct pt_regs *regs)
{
unsigned long msr;
- /*
- * Syscall exit makes a similar initial check before branching
- * to restore_math. Keep them in synch.
- */
if (!msr_tm_active(regs->msr) &&
!current->thread.load_fp && !loadvec(current->thread))
return;
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 925a4ef90559..660ed39e9c9a 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk)
* If task is not current, it will have been flushed already to
* it's thread_struct during __switch_to().
*
- * A reclaim flushes ALL the state.
+ * A reclaim flushes ALL the state; otherwise, when not in a
+ * transaction, save the live TM SPRs into the appropriate
+ * thread structure.
*/
- if (tsk == current && MSR_TM_SUSPENDED(mfmsr()))
- tm_reclaim_current(TM_CAUSE_SIGNAL);
+ if (tsk != current)
+ return;
+ if (MSR_TM_SUSPENDED(mfmsr())) {
+ tm_reclaim_current(TM_CAUSE_SIGNAL);
+ } else {
+ tm_enable();
+ tm_save_sprs(&(tsk->thread));
+ }
}
#else
static inline void flush_tmregs_to_thread(struct task_struct *tsk) { }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 997c88d54acf..8d3320562c70 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -351,7 +351,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
hard_irq_disable();
while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
raw_local_irq_restore(*flags);
- cpu_relax();
+ spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
raw_local_irq_save(*flags);
hard_irq_disable();
}
@@ -360,7 +360,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
static void nmi_ipi_lock(void)
{
while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
- cpu_relax();
+ spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
}
static void nmi_ipi_unlock(void)
@@ -475,7 +475,7 @@ int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
nmi_ipi_lock_start(&flags);
while (nmi_ipi_busy_count) {
nmi_ipi_unlock_end(&flags);
- cpu_relax();
+ spin_until_cond(nmi_ipi_busy_count == 0);
nmi_ipi_lock_start(&flags);
}
@@ -1003,21 +1003,13 @@ static struct sched_domain_topology_level powerpc_topology[] = {
{ NULL, },
};
-static __init long smp_setup_cpu_workfn(void *data __always_unused)
-{
- smp_ops->setup_cpu(boot_cpuid);
- return 0;
-}
-
void __init smp_cpus_done(unsigned int max_cpus)
{
/*
- * We want the setup_cpu() here to be called on the boot CPU, but
- * init might run on any CPU, so make sure it's invoked on the boot
- * CPU.
+ * We are running pinned to the boot CPU, see rest_init().
*/
if (smp_ops && smp_ops->setup_cpu)
- work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
+ smp_ops->setup_cpu(boot_cpuid);
if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done();
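
The spin_until_cond() changes above follow the usual test-and-test-and-set shape: spin on a plain read and only retry the atomic cmpxchg once the lock looks free, which keeps the cache line shared while waiting. A generic hedged sketch of the same pattern, not code from this patch:

#include <linux/atomic.h>
#include <linux/processor.h>	/* spin_until_cond() */

static atomic_t demo_lock = ATOMIC_INIT(0);

/* Test-and-test-and-set: only attempt the atomic when it can succeed. */
static void demo_lock_acquire(void)
{
	while (atomic_cmpxchg(&demo_lock, 0, 1) == 1)
		spin_until_cond(atomic_read(&demo_lock) == 0);
}

static void demo_lock_release(void)
{
	atomic_set(&demo_lock, 0);
}
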
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index b67f8b03a32d..34721a257a77 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -71,15 +71,20 @@ static inline void wd_smp_lock(unsigned long *flags)
* This may be called from low level interrupt handlers at some
* point in future.
*/
- local_irq_save(*flags);
- while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock)))
- cpu_relax();
+ raw_local_irq_save(*flags);
+ hard_irq_disable(); /* Make it soft-NMI safe */
+ while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
+ raw_local_irq_restore(*flags);
+ spin_until_cond(!test_bit(0, &__wd_smp_lock));
+ raw_local_irq_save(*flags);
+ hard_irq_disable();
+ }
}
static inline void wd_smp_unlock(unsigned long *flags)
{
clear_bit_unlock(0, &__wd_smp_lock);
- local_irq_restore(*flags);
+ raw_local_irq_restore(*flags);
}
static void wd_lockup_ipi(struct pt_regs *regs)
@@ -96,10 +101,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
nmi_panic(regs, "Hard LOCKUP");
}
-static void set_cpu_stuck(int cpu, u64 tb)
+static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
{
- cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
- cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+ cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
+ cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
if (cpumask_empty(&wd_smp_cpus_pending)) {
wd_smp_last_reset_tb = tb;
cpumask_andnot(&wd_smp_cpus_pending,
@@ -107,6 +112,10 @@ static void set_cpu_stuck(int cpu, u64 tb)
&wd_smp_cpus_stuck);
}
}
+static void set_cpu_stuck(int cpu, u64 tb)
+{
+ set_cpumask_stuck(cpumask_of(cpu), tb);
+}
static void watchdog_smp_panic(int cpu, u64 tb)
{
@@ -135,11 +144,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
}
smp_flush_nmi_ipi(1000000);
- /* Take the stuck CPU out of the watch group */
- for_each_cpu(c, &wd_smp_cpus_pending)
- set_cpu_stuck(c, tb);
+ /* Take the stuck CPUs out of the watch group */
+ set_cpumask_stuck(&wd_smp_cpus_pending, tb);
-out:
wd_smp_unlock(&flags);
printk_safe_flush();
@@ -152,6 +159,11 @@ out:
if (hardlockup_panic)
nmi_panic(NULL, "Hard LOCKUP");
+
+ return;
+
+out:
+ wd_smp_unlock(&flags);
}
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
@@ -258,9 +270,11 @@ static void wd_timer_fn(unsigned long data)
void arch_touch_nmi_watchdog(void)
{
+ unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
int cpu = smp_processor_id();
- watchdog_timer_interrupt(cpu);
+ if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
+ watchdog_timer_interrupt(cpu);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
@@ -283,6 +297,8 @@ static void stop_watchdog_timer_on(unsigned int cpu)
static int start_wd_on_cpu(unsigned int cpu)
{
+ unsigned long flags;
+
if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
WARN_ON(1);
return 0;
@@ -297,12 +313,14 @@ static int start_wd_on_cpu(unsigned int cpu)
if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
return 0;
+ wd_smp_lock(&flags);
cpumask_set_cpu(cpu, &wd_cpus_enabled);
if (cpumask_weight(&wd_cpus_enabled) == 1) {
cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
wd_smp_last_reset_tb = get_tb();
}
- smp_wmb();
+ wd_smp_unlock(&flags);
+
start_watchdog_timer_on(cpu);
return 0;
@@ -310,12 +328,17 @@ static int start_wd_on_cpu(unsigned int cpu)
static int stop_wd_on_cpu(unsigned int cpu)
{
+ unsigned long flags;
+
if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
return 0; /* Can happen in CPU unplug case */
stop_watchdog_timer_on(cpu);
+ wd_smp_lock(&flags);
cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+ wd_smp_unlock(&flags);
+
wd_smp_clear_cpu_pending(cpu, get_tb());
return 0;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a160c14304eb..53766e2bc029 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce_64 *args)
{
struct kvmppc_spapr_tce_table *stt = NULL;
+ struct kvmppc_spapr_tce_table *siter;
unsigned long npages, size;
int ret = -ENOMEM;
int i;
+ int fd = -1;
if (!args->size)
return -EINVAL;
- /* Check this LIOBN hasn't been previously allocated */
- list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
- if (stt->liobn == args->liobn)
- return -EBUSY;
- }
-
size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
npages = kvmppc_tce_pages(size);
ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
- if (ret) {
- stt = NULL;
- goto fail;
- }
+ if (ret)
+ return ret;
ret = -ENOMEM;
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
GFP_KERNEL);
if (!stt)
- goto fail;
+ goto fail_acct;
stt->liobn = args->liobn;
stt->page_shift = args->page_shift;
@@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
}
- kvm_get_kvm(kvm);
+ ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+ stt, O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto fail;
mutex_lock(&kvm->lock);
- list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+ /* Check this LIOBN hasn't been previously allocated */
+ ret = 0;
+ list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+ if (siter->liobn == args->liobn) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ if (!ret) {
+ list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+ kvm_get_kvm(kvm);
+ }
mutex_unlock(&kvm->lock);
- return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
- stt, O_RDWR | O_CLOEXEC);
+ if (!ret)
+ return fd;
-fail:
- if (stt) {
- for (i = 0; i < npages; i++)
- if (stt->pages[i])
- __free_page(stt->pages[i]);
+ put_unused_fd(fd);
- kfree(stt);
- }
+ fail:
+ for (i = 0; i < npages; i++)
+ if (stt->pages[i])
+ __free_page(stt->pages[i]);
+
+ kfree(stt);
+ fail_acct:
+ kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
return ret;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c52184a8efdf..9c9c983b864f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
/* Hypervisor doorbell - exit only if host IPI flag set */
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
bne 3f
+BEGIN_FTR_SECTION
+ PPC_MSGSYNC
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
beq 4f
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 4636ca6e7d38..d1ed2c41b5d2 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -16,7 +16,22 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
u8 cppr;
u16 ack;
- /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
+ /*
+ * Ensure any previous store to CPPR is ordered vs.
+ * the subsequent loads from PIPR or ACK.
+ */
+ eieio();
+
+ /*
+ * DD1 bug workaround: if PIPR is less favored than CPPR,
+ * ignore the interrupt or we might incorrectly lose an IPB
+ * bit.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+ if (pipr >= xc->hw_cppr)
+ return;
+ }
/* Perform the acknowledge OS to register cycle. */
ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
@@ -235,6 +250,11 @@ skip_ipi:
/*
* If we found an interrupt, adjust what the guest CPPR should
* be as if we had just fetched that interrupt from HW.
+ *
+ * Note: This can only make xc->cppr smaller as the previous
+ * loop will only exit with hirq != 0 if prio is lower than
+ * the current xc->cppr. Thus we don't need to re-check xc->mfrr
+ * for pending IPIs.
*/
if (hirq)
xc->cppr = prio;
@@ -381,6 +401,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
xc->cppr = cppr;
/*
+ * Order the above update of xc->cppr with the subsequent
+ * read of xc->mfrr inside push_pending_to_hw()
+ */
+ smp_mb();
+
+ /*
* We are masking less, we need to look for pending things
* to deliver and set VP pending bits accordingly to trigger
* a new interrupt otherwise we might miss MFRR changes for
@@ -420,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
* used to signal MFRR changes is EOId when fetched from
* the queue.
*/
- if (irq == XICS_IPI || irq == 0)
+ if (irq == XICS_IPI || irq == 0) {
+ /*
+ * This barrier orders the setting of xc->cppr vs.
+ * subsequent test of xc->mfrr done inside
+ * scan_interrupts and push_pending_to_hw.
+ */
+ smp_mb();
goto bail;
+ }
/* Find interrupt source */
sb = kvmppc_xive_find_source(xive, irq, &src);
if (!sb) {
pr_devel(" source not found !\n");
rc = H_PARAMETER;
+ /* Same as above */
+ smp_mb();
goto bail;
}
state = &sb->irq_state[src];
kvmppc_xive_select_irq(state, &hw_num, &xd);
state->in_eoi = true;
- mb();
+
+ /*
+ * This barrier orders both the setting of in_eoi above vs.
+ * the subsequent test of guest_priority, and the setting
+ * of xc->cppr vs. the subsequent test of xc->mfrr done inside
+ * scan_interrupts and push_pending_to_hw.
+ */
+ smp_mb();
again:
if (state->guest_priority == MASKED) {
@@ -461,6 +503,14 @@ again:
}
+ /*
+ * This barrier orders the above guest_priority check
+ * and spin_lock/unlock with clearing in_eoi below.
+ *
+ * It also has to be a full mb() as it must ensure
+ * the MMIOs done in source_eoi() are completed before
+ * state->in_eoi is visible.
+ */
mb();
state->in_eoi = false;
bail:
@@ -495,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
/* Locklessly write over MFRR */
xc->mfrr = mfrr;
+ /*
+ * The load of xc->cppr below and the subsequent MMIO store
+ * to the IPI must happen after the above mfrr update is
+ * globally visible so that:
+ *
+ * - Synchronize with another CPU doing an H_EOI or a H_CPPR
+ * updating xc->cppr then reading xc->mfrr.
+ *
+ * - The target of the IPI sees the xc->mfrr update
+ */
+ mb();
+
/* Shoot the IPI if more favored than the target cppr */
if (mfrr < xc->cppr)
__x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6c2d4168daec..2e3eb7431571 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
- if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+ if (event->attr.sample_type &
+ (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
perf_get_data_addr(regs, &data.addr);
if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index d7c9b186954d..763ffca9628d 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -89,7 +89,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
goto err;
ret = of_irq_to_resource(np, 0, &res[1]);
- if (!ret)
+ if (ret <= 0)
goto err;
pdev = platform_device_alloc("mpc83xx_spi", i);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2abee070373f..a553aeea7af6 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,6 +56,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
*/
static u64 pnv_deepest_stop_psscr_val;
static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
static bool deepest_stop_found;
static int pnv_save_sprs_for_deep_states(void)
@@ -185,8 +186,40 @@ static void pnv_alloc_idle_core_states(void)
update_subcore_sibling_mask();
- if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
- pnv_save_sprs_for_deep_states();
+ if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+ int rc = pnv_save_sprs_for_deep_states();
+
+ if (likely(!rc))
+ return;
+
+ /*
+ * The stop-api is unable to restore hypervisor
+ * resources on wakeup from platform idle states which
+ * lose full context. So disable such states.
+ */
+ supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+ pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+ pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+ /*
+ * Use the default stop state for CPU-Hotplug
+ * if available.
+ */
+ if (default_stop_found) {
+ pnv_deepest_stop_psscr_val =
+ pnv_default_stop_val;
+ pnv_deepest_stop_psscr_mask =
+ pnv_default_stop_mask;
+ pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+ pnv_deepest_stop_psscr_val);
+ } else { /* Fallback to snooze loop for CPU-Hotplug */
+ deepest_stop_found = false;
+ pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+ }
+ }
+ }
}
u32 pnv_get_supported_cpuidle_states(void)
@@ -375,7 +408,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
pnv_deepest_stop_psscr_val;
srr1 = power9_idle_stop(psscr);
- } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+ } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
+ (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
(idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
@@ -553,6 +587,7 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
max_residency_ns = residency_ns[i];
pnv_deepest_stop_psscr_val = psscr_val[i];
pnv_deepest_stop_psscr_mask = psscr_mask[i];
+ pnv_deepest_stop_flag = flags[i];
deepest_stop_found = true;
}
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index b5d960d6db3d..4c7b8591f737 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
mmio_invalidate(npu_context, 1, address, true);
}
-static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
-
- mmio_invalidate(npu_context, 1, address, true);
-}
-
static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
.release = pnv_npu2_mn_release,
.change_pte = pnv_npu2_mn_change_pte,
- .invalidate_page = pnv_npu2_mn_invalidate_page,
.invalidate_range = pnv_npu2_mn_invalidate_range,
};
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 437613588df1..b900eb1d5e17 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1852,6 +1852,14 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
/* 4GB offset bypasses 32-bit space */
set_dma_offset(&pdev->dev, (1ULL << 32));
set_dma_ops(&pdev->dev, &dma_direct_ops);
+ } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
+ /*
+ * Fail the request if a DMA mask between 32 and 64 bits
+ * was requested but couldn't be fulfilled. Ideally we
+ * would do this for 64-bits but historically we have
+ * always fallen back to 32-bits.
+ */
+ return -ENOMEM;
} else {
dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
set_dma_ops(&pdev->dev, &dma_iommu_ops);
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index b9300f8aee10..07a82bc933a7 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -8,11 +8,12 @@
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
-#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64))
+#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \
+ typeof(0?(__force t)0:0ULL), u64))
#define __SC_DELOUSE(t,v) ({ \
BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \
- (t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
+ (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
})
#define PSW32_MASK_PER 0x40000000UL
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index a4811aa0304d..8f8eec9e1198 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -21,17 +21,12 @@
: "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
"m" (*uaddr) : "cc");
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, newval, ret;
load_kernel_asce();
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
pagefault_disable();
switch (op) {
@@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
}
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
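
This conversion, repeated below for sh, sparc, and tile, moves the encoded_op decoding and the post-operation comparison out of each architecture and leaves only the atomic operation plus the old-value report in arch_futex_atomic_op_inuser(). A sketch of the common-code caller this implies; the wrapper body is reconstructed from the logic being deleted here and its exact placement in generic code is an assumption:

	#include <linux/futex.h>
	#include <linux/uaccess.h>

	/* Generic-side sketch: decode once, call the arch helper, compare once. */
	static int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
	{
		int op = (encoded_op >> 28) & 7;
		int cmp = (encoded_op >> 24) & 15;
		int oparg = (encoded_op << 8) >> 20;
		int cmparg = (encoded_op << 20) >> 20;
		int oldval, ret;

		if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
			oparg = 1 << oparg;

		ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
		if (ret)
			return ret;

		switch (cmp) {
		case FUTEX_OP_CMP_EQ: return oldval == cmparg;
		case FUTEX_OP_CMP_NE: return oldval != cmparg;
		case FUTEX_OP_CMP_LT: return oldval < cmparg;
		case FUTEX_OP_CMP_GE: return oldval >= cmparg;
		case FUTEX_OP_CMP_LE: return oldval <= cmparg;
		case FUTEX_OP_CMP_GT: return oldval > cmparg;
		default:	      return -ENOSYS;
		}
	}
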
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 4541ac44b35f..24bc41622a98 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,11 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
break;
+ case -PAGE_SIZE:
+ /* forked 5-level task, set new asce with new_mm->pgd */
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+ break;
case 1UL << 53:
/* forked 4-level task, set new asce with new mm->pgd */
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index f7838ecd83c6..217ee5210c32 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -98,13 +98,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
: "cc", "memory");
}
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- while (arch_spin_is_locked(lock))
- arch_spin_relax(lock);
- smp_acquire__after_ctrl_dep();
-}
-
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 7317b3108a88..2eb8ff0d6fca 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -47,10 +47,9 @@ struct mmu_table_batch {
extern void tlb_table_flush(struct mmu_gather *tlb);
extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
-static inline void tlb_gather_mmu(struct mmu_gather *tlb,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+static inline void
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
tlb->start = start;
@@ -76,9 +75,15 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
tlb_flush_mmu_free(tlb);
}
-static inline void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+static inline void
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
+ if (force) {
+ tlb->start = start;
+ tlb->end = end;
+ }
+
tlb_flush_mmu(tlb);
}
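
The rename to arch_tlb_gather_mmu()/arch_tlb_finish_mmu() and the new force argument (mirrored for sh and um below) imply a common wrapper that decides when a flush must happen even if this gather saw nothing. A sketch of that caller side; the helper names used here (mm_tlb_flush_nested(), dec_tlb_flush_pending()) describe the intended generic-code behavior and are assumptions, since that side is not shown in this diff:

	/* Sketch of the mm/ side these arch hooks plug into. */
	void tlb_finish_mmu(struct mmu_gather *tlb,
			    unsigned long start, unsigned long end)
	{
		/*
		 * Force a flush when another thread may have batched PTE changes
		 * on the same mm concurrently; otherwise trust the gathered range.
		 */
		bool force = mm_tlb_flush_nested(tlb->mm);

		arch_tlb_finish_mmu(tlb, start, end, force);
		dec_tlb_flush_pending(tlb->mm);
	}
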
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index 926b5244263e..a2e5c24f47a7 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -394,7 +394,7 @@ static int sthyi(u64 vaddr)
"srl %[cc],28\n"
: [cc] "=d" (cc)
: [code] "d" (code), [addr] "a" (addr)
- : "memory", "cc");
+ : "3", "memory", "cc");
return cc;
}
@@ -425,7 +425,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
trace_kvm_s390_handle_sthyi(vcpu, code, addr);
- if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+ if (reg1 == reg2 || reg1 & 1 || reg2 & 1)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
if (code & 0xffff) {
@@ -433,6 +433,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
goto out;
}
+ if (addr & ~PAGE_MASK)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
/*
* If the page has not yet been faulted in, we want to do that
* now and not after all the expensive calculations.
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 2e10d2b8ad35..5bea139517a2 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -119,7 +119,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit) {
+ if (addr + len > current->mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
rc = crst_table_upgrade(mm, addr + len);
if (rc)
return (unsigned long) rc;
@@ -183,7 +184,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit) {
+ if (addr + len > current->mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
rc = crst_table_upgrade(mm, addr + len);
if (rc)
return (unsigned long) rc;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 01c6fbc3e85b..1803797fc885 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1253,7 +1253,8 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
insn_count = bpf_jit_insn(jit, fp, i);
if (insn_count < 0)
return -1;
- jit->addrs[i + 1] = jit->prg; /* Next instruction address */
+ /* Next instruction address */
+ jit->addrs[i + insn_count] = jit->prg;
}
bpf_jit_epilogue(jit);
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index d0078747d308..8f8cf941a8cd 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
}
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- u32 oparg = (encoded_op << 8) >> 20;
- u32 cmparg = (encoded_op << 20) >> 20;
u32 oldval, newval, prev;
int ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
do {
@@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break;
- case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break;
- case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break;
- case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
return ret;
}
diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h
index c46e8cc7b515..5ed7dbbd94ff 100644
--- a/arch/sh/include/asm/spinlock-cas.h
+++ b/arch/sh/include/asm/spinlock-cas.h
@@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new
#define arch_spin_is_locked(x) ((x)->lock <= 0)
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
static inline void arch_spin_lock(arch_spinlock_t *lock)
{
while (!__sl_cas(&lock->lock, 1, 0));
diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h
index cec78143fa83..f77263aae760 100644
--- a/arch/sh/include/asm/spinlock-llsc.h
+++ b/arch/sh/include/asm/spinlock-llsc.h
@@ -21,11 +21,6 @@
#define arch_spin_is_locked(x) ((x)->lock <= 0)
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
/*
* Simple spin lock operations. There are two variants, one clears IRQ's
* on the local processor, one does not.
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 46e0d635e36f..51a8bc967e75 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -36,7 +36,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
}
static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
tlb->start = start;
@@ -47,9 +48,10 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
}
static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
- if (tlb->fullmm)
+ if (tlb->fullmm || force)
flush_tlb_mm(tlb->mm);
/* keep the page table cache within bounds */
diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig
index c74d3701ad68..207a43a2d8b3 100644
--- a/arch/sparc/configs/sparc32_defconfig
+++ b/arch/sparc/configs/sparc32_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_LOG_BUF_SHIFT=14
@@ -23,7 +22,6 @@ CONFIG_IP_PNP_DHCP=y
CONFIG_INET_AH=y
CONFIG_INET_ESP=y
CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
CONFIG_INET6_IPCOMP=m
@@ -69,7 +67,6 @@ CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT2_FS_POSIX_ACL=y
CONFIG_EXT2_FS_SECURITY=y
-CONFIG_AUTOFS_FS=m
CONFIG_AUTOFS4_FS=m
CONFIG_ISO9660_FS=m
CONFIG_PROC_KCORE=y
@@ -82,7 +79,6 @@ CONFIG_NLS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_SCHED_DEBUG is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_KGDB=y
CONFIG_KGDB_TESTS=y
CONFIG_CRYPTO_NULL=m
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index b2e650d1764f..ca8609d7292f 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -1,5 +1,4 @@
CONFIG_64BIT=y
-CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
@@ -184,7 +183,6 @@ CONFIG_HID_TOPSEED=y
CONFIG_HID_THRUSTMASTER=y
CONFIG_HID_ZEROPLUS=y
CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
CONFIG_USB_EHCI_HCD=m
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
CONFIG_USB_OHCI_HCD=y
@@ -210,8 +208,6 @@ CONFIG_LOCKUP_DETECTOR=y
CONFIG_DETECT_HUNG_TASK=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_UPROBE_EVENTS=y
CONFIG_KEYS=y
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index ee3f11c43cda..7643e979e333 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int);
int __atomic_add_unless(atomic_t *, int, int);
void atomic_set(atomic_t *, int);
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
#define atomic_read(v) ACCESS_ONCE((v)->counter)
#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v)))
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 4e899b0dabf7..1cfd89d92208 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -29,22 +29,14 @@
: "r" (uaddr), "r" (oparg), "i" (-EFAULT) \
: "memory")
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret, tem;
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
if (unlikely((((unsigned long) uaddr) & 0x3UL)))
return -EINVAL;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
pagefault_disable();
switch (op) {
@@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
- case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
- case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
- case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
- case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
- case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
- default: ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 2cddcda4f85f..87841d687f8d 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -27,9 +27,11 @@ void destroy_context(struct mm_struct *mm);
void __tsb_context_switch(unsigned long pgd_pa,
struct tsb_config *tsb_base,
struct tsb_config *tsb_huge,
- unsigned long tsb_descr_pa);
+ unsigned long tsb_descr_pa,
+ unsigned long secondary_ctx);
-static inline void tsb_context_switch(struct mm_struct *mm)
+static inline void tsb_context_switch_ctx(struct mm_struct *mm,
+ unsigned long ctx)
{
__tsb_context_switch(__pa(mm->pgd),
&mm->context.tsb_block[MM_TSB_BASE],
@@ -40,9 +42,12 @@ static inline void tsb_context_switch(struct mm_struct *mm)
#else
NULL
#endif
- , __pa(&mm->context.tsb_descr[MM_TSB_BASE]));
+ , __pa(&mm->context.tsb_descr[MM_TSB_BASE]),
+ ctx);
}
+#define tsb_context_switch(X) tsb_context_switch_ctx(X, 0)
+
void tsb_grow(struct mm_struct *mm,
unsigned long tsb_index,
unsigned long mm_rss);
@@ -112,8 +117,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
	 * cpu0 to update its TSB because at that point the cpu_vm_mask
* only had cpu1 set in it.
*/
- load_secondary_context(mm);
- tsb_context_switch(mm);
+ tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context));
/* Any time a processor runs a context on an address space
* for the first time, we must flush that context out of the
diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index 0efd0583a8c9..6249214148c2 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -68,6 +68,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t;
#define iopgprot_val(x) ((x).iopgprot)
#define __pte(x) ((pte_t) { (x) } )
+#define __pmd(x) ((pmd_t) { { (x) }, })
#define __iopte(x) ((iopte_t) { (x) } )
#define __pgd(x) ((pgd_t) { (x) } )
#define __ctxd(x) ((ctxd_t) { (x) } )
@@ -95,6 +96,7 @@ typedef unsigned long iopgprot_t;
#define iopgprot_val(x) (x)
#define __pte(x) (x)
+#define __pmd(x) ((pmd_t) { { (x) }, })
#define __iopte(x) (x)
#define __pgd(x) (x)
#define __ctxd(x) (x)
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 8011e79f59c9..67345b2dc408 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -14,11 +14,6 @@
#define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
static inline void arch_spin_lock(arch_spinlock_t *lock)
{
__asm__ __volatile__(
diff --git a/arch/sparc/include/asm/spitfire.h b/arch/sparc/include/asm/spitfire.h
index 1d8321c827a8..1b1286d05069 100644
--- a/arch/sparc/include/asm/spitfire.h
+++ b/arch/sparc/include/asm/spitfire.h
@@ -47,10 +47,26 @@
#define SUN4V_CHIP_NIAGARA5 0x05
#define SUN4V_CHIP_SPARC_M6 0x06
#define SUN4V_CHIP_SPARC_M7 0x07
+#define SUN4V_CHIP_SPARC_M8 0x08
#define SUN4V_CHIP_SPARC64X 0x8a
#define SUN4V_CHIP_SPARC_SN 0x8b
#define SUN4V_CHIP_UNKNOWN 0xff
+/*
+ * The following CPU_ID_xxx constants are used
+ * to identify the CPU type in the setup phase
+ * (see head_64.S)
+ */
+#define CPU_ID_NIAGARA1 ('1')
+#define CPU_ID_NIAGARA2 ('2')
+#define CPU_ID_NIAGARA3 ('3')
+#define CPU_ID_NIAGARA4 ('4')
+#define CPU_ID_NIAGARA5 ('5')
+#define CPU_ID_M6 ('6')
+#define CPU_ID_M7 ('7')
+#define CPU_ID_M8 ('8')
+#define CPU_ID_SONOMA1 ('N')
+
#ifndef __ASSEMBLY__
enum ultra_tlb_layout {
diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c
index 493e023a468a..ef4f18f7a674 100644
--- a/arch/sparc/kernel/cpu.c
+++ b/arch/sparc/kernel/cpu.c
@@ -506,6 +506,12 @@ static void __init sun4v_cpu_probe(void)
sparc_pmu_type = "sparc-m7";
break;
+ case SUN4V_CHIP_SPARC_M8:
+ sparc_cpu_type = "SPARC-M8";
+ sparc_fpu_type = "SPARC-M8 integrated FPU";
+ sparc_pmu_type = "sparc-m8";
+ break;
+
case SUN4V_CHIP_SPARC_SN:
sparc_cpu_type = "SPARC-SN";
sparc_fpu_type = "SPARC-SN integrated FPU";
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
index 45c820e1cba5..90d550bbfeef 100644
--- a/arch/sparc/kernel/cpumap.c
+++ b/arch/sparc/kernel/cpumap.c
@@ -328,6 +328,7 @@ static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
case SUN4V_CHIP_NIAGARA5:
case SUN4V_CHIP_SPARC_M6:
case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
case SUN4V_CHIP_SPARC_SN:
case SUN4V_CHIP_SPARC64X:
rover_inc_table = niagara_iterate_method;
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 41a407328667..78e0211753d2 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -424,22 +424,25 @@ EXPORT_SYMBOL(sun4v_chip_type)
nop
70: ldub [%g1 + 7], %g2
- cmp %g2, '3'
+ cmp %g2, CPU_ID_NIAGARA3
be,pt %xcc, 5f
mov SUN4V_CHIP_NIAGARA3, %g4
- cmp %g2, '4'
+ cmp %g2, CPU_ID_NIAGARA4
be,pt %xcc, 5f
mov SUN4V_CHIP_NIAGARA4, %g4
- cmp %g2, '5'
+ cmp %g2, CPU_ID_NIAGARA5
be,pt %xcc, 5f
mov SUN4V_CHIP_NIAGARA5, %g4
- cmp %g2, '6'
+ cmp %g2, CPU_ID_M6
be,pt %xcc, 5f
mov SUN4V_CHIP_SPARC_M6, %g4
- cmp %g2, '7'
+ cmp %g2, CPU_ID_M7
be,pt %xcc, 5f
mov SUN4V_CHIP_SPARC_M7, %g4
- cmp %g2, 'N'
+ cmp %g2, CPU_ID_M8
+ be,pt %xcc, 5f
+ mov SUN4V_CHIP_SPARC_M8, %g4
+ cmp %g2, CPU_ID_SONOMA1
be,pt %xcc, 5f
mov SUN4V_CHIP_SPARC_SN, %g4
ba,pt %xcc, 49f
@@ -448,10 +451,10 @@ EXPORT_SYMBOL(sun4v_chip_type)
91: sethi %hi(prom_cpu_compatible), %g1
or %g1, %lo(prom_cpu_compatible), %g1
ldub [%g1 + 17], %g2
- cmp %g2, '1'
+ cmp %g2, CPU_ID_NIAGARA1
be,pt %xcc, 5f
mov SUN4V_CHIP_NIAGARA1, %g4
- cmp %g2, '2'
+ cmp %g2, CPU_ID_NIAGARA2
be,pt %xcc, 5f
mov SUN4V_CHIP_NIAGARA2, %g4
@@ -602,6 +605,9 @@ niagara_tlb_fixup:
cmp %g1, SUN4V_CHIP_SPARC_M7
be,pt %xcc, niagara4_patch
nop
+ cmp %g1, SUN4V_CHIP_SPARC_M8
+ be,pt %xcc, niagara4_patch
+ nop
cmp %g1, SUN4V_CHIP_SPARC_SN
be,pt %xcc, niagara4_patch
nop
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index f10e2f712394..9ebebf1fd93d 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -1266,8 +1266,6 @@ static int pci_sun4v_probe(struct platform_device *op)
* ATU group, but ATU hcalls won't be available.
*/
hv_atu = false;
- pr_err(PFX "Could not register hvapi ATU err=%d\n",
- err);
} else {
pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n",
vatu_major, vatu_minor);
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index a38787b84322..732af9a9f6dd 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -602,7 +602,7 @@ void pcibios_fixup_bus(struct pci_bus *bus)
{
struct pci_dev *dev;
int i, has_io, has_mem;
- unsigned int cmd;
+ unsigned int cmd = 0;
struct linux_pcic *pcic;
/* struct linux_pbm_info* pbm = &pcic->pbm; */
int node;
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 4d9c3e13c150..150ee7d4b059 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -288,10 +288,17 @@ static void __init sun4v_patch(void)
sun4v_patch_2insn_range(&__sun4v_2insn_patch,
&__sun4v_2insn_patch_end);
- if (sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
- sun4v_chip_type == SUN4V_CHIP_SPARC_SN)
+
+ switch (sun4v_chip_type) {
+ case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
+ case SUN4V_CHIP_SPARC_SN:
sun_m7_patch_2insn_range(&__sun_m7_2insn_patch,
&__sun_m7_2insn_patch_end);
+ break;
+ default:
+ break;
+ }
sun4v_hvapi_init();
}
@@ -529,6 +536,7 @@ static void __init init_sparc64_elf_hwcap(void)
sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+ sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
sun4v_chip_type == SUN4V_CHIP_SPARC64X)
cap |= HWCAP_SPARC_BLKINIT;
@@ -538,6 +546,7 @@ static void __init init_sparc64_elf_hwcap(void)
sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+ sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
sun4v_chip_type == SUN4V_CHIP_SPARC64X)
cap |= HWCAP_SPARC_N2;
@@ -568,6 +577,7 @@ static void __init init_sparc64_elf_hwcap(void)
sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+ sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
sun4v_chip_type == SUN4V_CHIP_SPARC64X)
cap |= (AV_SPARC_VIS | AV_SPARC_VIS2 |
@@ -578,6 +588,7 @@ static void __init init_sparc64_elf_hwcap(void)
sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+ sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
sun4v_chip_type == SUN4V_CHIP_SPARC64X)
cap |= (AV_SPARC_VIS3 | AV_SPARC_HPC |
diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S
index 07c0df924960..db872dbfafe9 100644
--- a/arch/sparc/kernel/tsb.S
+++ b/arch/sparc/kernel/tsb.S
@@ -360,6 +360,7 @@ tsb_flush:
* %o1: TSB base config pointer
* %o2: TSB huge config pointer, or NULL if none
* %o3: Hypervisor TSB descriptor physical address
+ * %o4: Secondary context to load, if non-zero
*
* We have to run this whole thing with interrupts
* disabled so that the current cpu doesn't change
@@ -372,6 +373,17 @@ __tsb_context_switch:
rdpr %pstate, %g1
wrpr %g1, PSTATE_IE, %pstate
+ brz,pn %o4, 1f
+ mov SECONDARY_CONTEXT, %o5
+
+661: stxa %o4, [%o5] ASI_DMMU
+ .section .sun4v_1insn_patch, "ax"
+ .word 661b
+ stxa %o4, [%o5] ASI_MMU
+ .previous
+ flush %g6
+
+1:
TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
stx %o0, [%g2 + TRAP_PER_CPU_PGD_PADDR]
diff --git a/arch/sparc/lib/U3memcpy.S b/arch/sparc/lib/U3memcpy.S
index 54f98706b03b..5a8cb37f0a3b 100644
--- a/arch/sparc/lib/U3memcpy.S
+++ b/arch/sparc/lib/U3memcpy.S
@@ -145,13 +145,13 @@ ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
and %o2, 7, %o2
retl
- add %o2, GLOBAL_SPARE, %o2
+ add %o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
add GLOBAL_SPARE, 8, GLOBAL_SPARE
and %o2, 7, %o2
retl
- add %o2, GLOBAL_SPARE, %o2
+ add %o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif
diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S
index d6b6c97fe3c7..703127aaf4a5 100644
--- a/arch/sparc/lib/multi3.S
+++ b/arch/sparc/lib/multi3.S
@@ -5,26 +5,26 @@
.align 4
ENTRY(__multi3) /* %o0 = u, %o1 = v */
mov %o1, %g1
- srl %o3, 0, %g4
- mulx %g4, %g1, %o1
+ srl %o3, 0, %o4
+ mulx %o4, %g1, %o1
srlx %g1, 0x20, %g3
- mulx %g3, %g4, %g5
- sllx %g5, 0x20, %o5
- srl %g1, 0, %g4
+ mulx %g3, %o4, %g7
+ sllx %g7, 0x20, %o5
+ srl %g1, 0, %o4
sub %o1, %o5, %o5
srlx %o5, 0x20, %o5
- addcc %g5, %o5, %g5
+ addcc %g7, %o5, %g7
srlx %o3, 0x20, %o5
- mulx %g4, %o5, %g4
+ mulx %o4, %o5, %o4
mulx %g3, %o5, %o5
sethi %hi(0x80000000), %g3
- addcc %g5, %g4, %g5
- srlx %g5, 0x20, %g5
+ addcc %g7, %o4, %g7
+ srlx %g7, 0x20, %g7
add %g3, %g3, %g3
movcc %xcc, %g0, %g3
- addcc %o5, %g5, %o5
- sllx %g4, 0x20, %g4
- add %o1, %g4, %o1
+ addcc %o5, %g7, %o5
+ sllx %o4, 0x20, %o4
+ add %o1, %o4, %o1
add %o5, %g3, %g2
mulx %g1, %o2, %g1
add %g1, %g2, %g1
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3c40ebd50f92..afa0099f3748 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -325,6 +325,29 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
}
#ifdef CONFIG_HUGETLB_PAGE
+static void __init add_huge_page_size(unsigned long size)
+{
+ unsigned int order;
+
+ if (size_to_hstate(size))
+ return;
+
+ order = ilog2(size) - PAGE_SHIFT;
+ hugetlb_add_hstate(order);
+}
+
+static int __init hugetlbpage_init(void)
+{
+ add_huge_page_size(1UL << HPAGE_64K_SHIFT);
+ add_huge_page_size(1UL << HPAGE_SHIFT);
+ add_huge_page_size(1UL << HPAGE_256MB_SHIFT);
+ add_huge_page_size(1UL << HPAGE_2GB_SHIFT);
+
+ return 0;
+}
+
+arch_initcall(hugetlbpage_init);
+
static int __init setup_hugepagesz(char *string)
{
unsigned long long hugepage_size;
@@ -364,7 +387,7 @@ static int __init setup_hugepagesz(char *string)
goto out;
}
- hugetlb_add_hstate(hugepage_shift - PAGE_SHIFT);
+ add_huge_page_size(hugepage_size);
rc = 1;
out:
@@ -1921,12 +1944,22 @@ static void __init setup_page_offset(void)
break;
case SUN4V_CHIP_SPARC_M7:
case SUN4V_CHIP_SPARC_SN:
- default:
/* M7 and later support 52-bit virtual addresses. */
sparc64_va_hole_top = 0xfff8000000000000UL;
sparc64_va_hole_bottom = 0x0008000000000000UL;
max_phys_bits = 49;
break;
+ case SUN4V_CHIP_SPARC_M8:
+ default:
+ /* M8 and later support 54-bit virtual addresses.
+			 * However, we restrict M8 and later to 53 VA bits,
+			 * because a 4-level page table cannot support more
+			 * than 53 VA bits.
+ */
+ sparc64_va_hole_top = 0xfff0000000000000UL;
+ sparc64_va_hole_bottom = 0x0010000000000000UL;
+ max_phys_bits = 51;
+ break;
}
}
@@ -2138,6 +2171,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
*/
switch (sun4v_chip_type) {
case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
case SUN4V_CHIP_SPARC_SN:
pagecv_flag = 0x00;
break;
@@ -2290,6 +2324,7 @@ void __init paging_init(void)
*/
switch (sun4v_chip_type) {
case SUN4V_CHIP_SPARC_M7:
+ case SUN4V_CHIP_SPARC_M8:
case SUN4V_CHIP_SPARC_SN:
page_cache4v_flag = _PAGE_CP_4V;
break;
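
A quick consistency check of the M8 virtual-address hole chosen in the setup_page_offset() hunk above (standalone sketch; the macro names are illustrative and it assumes a 64-bit unsigned long): the low and high halves are 2^52 bytes each, i.e. a 53-bit usable virtual address space.

	#define M8_VA_HOLE_BOTTOM	0x0010000000000000UL
	#define M8_VA_HOLE_TOP		0xfff0000000000000UL

	_Static_assert(M8_VA_HOLE_BOTTOM == (1UL << 52), "low half spans 2^52 bytes");
	_Static_assert(0UL - M8_VA_HOLE_TOP == (1UL << 52), "high half spans 2^52 bytes");
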
diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c
index 17bd2e167e07..df707a8ad311 100644
--- a/arch/sparc/power/hibernate.c
+++ b/arch/sparc/power/hibernate.c
@@ -35,6 +35,5 @@ void restore_processor_state(void)
{
struct mm_struct *mm = current->active_mm;
- load_secondary_context(mm);
- tsb_context_switch(mm);
+ tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context));
}
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index a93774255136..53a423e7cb92 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n)
_atomic_xchg(&v->counter, n);
}
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
/* A 64bit atomic type */
typedef struct {
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index e64a1b75fc38..83c1e639b411 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -106,12 +106,9 @@
lock = __atomic_hashed_lock((int __force *)uaddr)
#endif
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int uninitialized_var(val), ret;
__futex_prolog();
@@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
/* The 32-bit futex code makes this assumption, so validate it here. */
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
case FUTEX_OP_SET:
@@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
}
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (val == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (val != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (val < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (val >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (val <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (val > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = val;
+
return ret;
}
diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h
index b14b1ba5bf9c..cba8ba9b8da6 100644
--- a/arch/tile/include/asm/spinlock_32.h
+++ b/arch/tile/include/asm/spinlock_32.h
@@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
lock->current_ticket = old_ticket + TICKET_QUANTUM;
}
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index b9718fb4e74a..9a2c2d605752 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
__insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT);
}
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val);
/* Grab the "next" ticket number and bump it atomically.
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 076c6cc43113..db9333f2447c 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
}
EXPORT_SYMBOL(arch_spin_trylock);
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- u32 iterations = 0;
- int curr = READ_ONCE(lock->current_ticket);
- int next = READ_ONCE(lock->next_ticket);
-
- /* Return immediately if unlocked. */
- if (next == curr)
- return;
-
- /* Wait until the current locker has released the lock. */
- do {
- delay_backoff(iterations++);
- } while (READ_ONCE(lock->current_ticket) == curr);
-
- /*
- * The TILE architecture doesn't do read speculation; therefore
- * a control dependency guarantees a LOAD->{LOAD,STORE} order.
- */
- barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
-
/*
* The low byte is always reserved to be the marker for a "tns" operation
* since the low bit is set to "1" by a tns. The next seven bits are
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
index a4b5b2cbce93..de414c22892f 100644
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
}
EXPORT_SYMBOL(arch_spin_trylock);
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- u32 iterations = 0;
- u32 val = READ_ONCE(lock->lock);
- u32 curr = arch_spin_current(val);
-
- /* Return immediately if unlocked. */
- if (arch_spin_next(val) == curr)
- return;
-
- /* Wait until the current locker has released the lock. */
- do {
- delay_backoff(iterations++);
- } while (arch_spin_current(READ_ONCE(lock->lock)) == curr);
-
- /*
- * The TILE architecture doesn't do read speculation; therefore
- * a control dependency guarantees a LOAD->{LOAD,STORE} order.
- */
- barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
/*
* If the read lock fails due to a writer, we retry periodically
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 600a2e9bfee2..344d95619d03 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -45,7 +45,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
}
static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
tlb->start = start;
@@ -80,13 +81,19 @@ tlb_flush_mmu(struct mmu_gather *tlb)
tlb_flush_mmu_free(tlb);
}
-/* tlb_finish_mmu
+/* arch_tlb_finish_mmu
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
+ if (force) {
+ tlb->start = start;
+ tlb->end = end;
+ tlb->need_flush = 1;
+ }
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h
new file mode 100644
index 000000000000..7ffa5437b761
--- /dev/null
+++ b/arch/um/include/asm/unwind.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_UML_UNWIND_H
+#define _ASM_UML_UNWIND_H
+
+static inline void
+unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size) {}
+
+#endif /* _ASM_UML_UNWIND_H */
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 586b786b3edf..f65a804b86f0 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
# Hyper-V paravirtualization support
obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
obj-y += realmode/
obj-y += kernel/
obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b1abda70c2d1..acb366bf6bc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,8 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
+ # Causing hangs/crashes, see the commit that added this change for details.
+ select ARCH_HAS_REFCOUNT if BROKEN
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
@@ -73,7 +75,6 @@ config X86
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
@@ -100,6 +101,7 @@ config X86
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+ select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
@@ -157,17 +159,19 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_PERF_EVENTS_NMI
- select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI
+ select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -326,6 +330,7 @@ config FIX_EARLYCON_MEM
config PGTABLE_LEVELS
int
+ default 5 if X86_5LEVEL
default 4 if X86_64
default 3 if X86_PAE
default 2
@@ -777,8 +782,6 @@ config KVM_DEBUG_FS
Statistics are displayed in debugfs filesystem. Enabling this option
may incur significant overhead.
-source "arch/x86/lguest/Kconfig"
-
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
@@ -1398,6 +1401,24 @@ config X86_PAE
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
+config X86_5LEVEL
+ bool "Enable 5-level page tables support"
+ depends on X86_64
+ ---help---
+	  5-level paging enables access to a larger address space:
+	  up to 128 PiB of virtual address space and 4 PiB of
+ physical address space.
+
+ It will be supported by future Intel CPUs.
+
+ Note: a kernel with this option enabled can only be booted
+ on machines that support the feature.
+
+ See Documentation/x86/x86_64/5level-paging.txt for more
+ information.
+
+ Say N if unsure.
+
config ARCH_PHYS_ADDR_T_64BIT
def_bool y
depends on X86_64 || X86_PAE
@@ -1415,6 +1436,35 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
+config ARCH_HAS_MEM_ENCRYPT
+ def_bool y
+
+config AMD_MEM_ENCRYPT
+ bool "AMD Secure Memory Encryption (SME) support"
+ depends on X86_64 && CPU_SUP_AMD
+ ---help---
+ Say yes to enable support for the encryption of system memory.
+ This requires an AMD processor that supports Secure Memory
+ Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+ bool "Activate AMD Secure Memory Encryption (SME) by default"
+ default y
+ depends on AMD_MEM_ENCRYPT
+ ---help---
+ Say yes to have system memory encrypted by default if running on
+ an AMD processor that supports Secure Memory Encryption (SME).
+
+ If set to Y, then the encryption of system memory can be
+ deactivated with the mem_encrypt=off command line option.
+
+ If set to N, then the encryption of system memory can be
+ activated with the mem_encrypt=on command line option.
+
+config ARCH_USE_MEMREMAP_PROT
+ def_bool y
+ depends on AMD_MEM_ENCRYPT
+
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cd20ca0b4043..71a48a30fc84 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
Some of these sanity checks may slow down kernel entries and
exits or otherwise impact performance.
- This is currently used to help test NMI code.
-
If unsure, say N.
config DEBUG_NMI_SELFTEST
@@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG
The current power state can be read from
/sys/kernel/debug/punit_atom/dev_power_state
+choice
+ prompt "Choose kernel unwinder"
+ default FRAME_POINTER_UNWINDER
+ ---help---
+ This determines which method will be used for unwinding kernel stack
+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+ livepatch, lockdep, and more.
+
+config FRAME_POINTER_UNWINDER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config ORC_UNWINDER
+ bool "ORC unwinder"
+ depends on X86_64
+ select STACK_VALIDATION
+ ---help---
+ This option enables the ORC (Oops Rewind Capability) unwinder for
+ unwinding kernel stack traces. It uses a custom data format which is
+ a simplified version of the DWARF Call Frame Information standard.
+
+ This unwinder is more accurate across interrupt entry frames than the
+ frame pointer unwinder. It also enables a 5-10% performance
+ improvement across the entire kernel compared to frame pointers.
+
+ Enabling this option will increase the kernel's runtime memory usage
+ by roughly 2-4MB, depending on your kernel config.
+
+config GUESS_UNWINDER
+ bool "Guess unwinder"
+ depends on EXPERT
+ ---help---
+ This option enables the "guess" unwinder for unwinding kernel stack
+ traces. It scans the stack and reports every kernel text address it
+ finds. Some of the addresses it reports may be incorrect.
+
+ While this option often produces false positives, it can still be
+ useful in many cases. Unlike the other unwinders, it has no runtime
+ overhead.
+
+endchoice
+
+config FRAME_POINTER
+ depends on !ORC_UNWINDER && !GUESS_UNWINDER
+ bool
+
endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1e902f926be3..6276572259c8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
- cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
- cc_stack_align_opt := -mstack-alignment
+ cc_stack_align4 := -mpreferred-stack-boundary=2
+ cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+ cc_stack_align4 := -mstack-alignment=4
+ cc_stack_align8 := -mstack-alignment=8
endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
# Align the stack to the register width instead of using the default
# alignment of 16 bytes. This reduces stack usage and the number of
# alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
# a lot more stack due to the lack of sharing of stacklots:
@@ -115,7 +117,7 @@ else
# default alignment which keep the stack *mis*aligned.
# Furthermore an alignment to the register width reduces stack usage
# and the number of alignment instructions.
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
# Use -mskip-rax-setup if supported.
KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
@@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
-
archscripts: scripts_basic
$(Q)$(MAKE) $(build)=arch/x86/tools relocs
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c3e869eaef0c..e007887a33b0 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params,
m |= (u64)efi->efi_memmap_hi << 32;
#endif
- d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
+ d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
switch (d->type) {
case EFI_RESERVED_TYPE:
case EFI_RUNTIME_SERVICES_CODE:
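
Both this hunk and the KASLR changes further down walk the EFI memory map with efi_early_memdesc_ptr() instead of open-coded pointer arithmetic. The helper itself is not part of this diff; a sketch of what it is expected to do, and why the stride comes from the firmware-provided descriptor size rather than sizeof(efi_memory_desc_t):

	#include <linux/efi.h>

	/*
	 * Firmware may report a descriptor size larger than the structure the
	 * kernel knows about, so stepping by sizeof(efi_memory_desc_t) would
	 * mis-index the map; the efi_memdesc_size from boot_params is used.
	 */
	static inline efi_memory_desc_t *
	efi_early_memdesc_ptr(unsigned long map, unsigned long desc_size, int n)
	{
		return (efi_memory_desc_t *)(map + (n * desc_size));
	}
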
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d85b9625e836..11c68cf53d4e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -61,71 +61,6 @@
__HEAD
ENTRY(startup_32)
-#ifdef CONFIG_EFI_STUB
- jmp preferred_addr
-
- /*
- * We don't need the return address, so set up the stack so
- * efi_main() can find its arguments.
- */
-ENTRY(efi_pe_entry)
- add $0x4, %esp
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- popl %ecx
- movl %ecx, efi32_config(%esi) /* Handle */
- popl %ecx
- movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-
- call make_boot_params
- cmpl $0, %eax
- je fail
- movl %esi, BP_code32_start(%eax)
- popl %ecx
- pushl %eax
- pushl %ecx
- jmp 2f /* Skip efi_config initialization */
-
-ENTRY(efi32_stub_entry)
- add $0x4, %esp
- popl %ecx
- popl %edx
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- movl %ecx, efi32_config(%esi) /* Handle */
- movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-2:
- call efi_main
- cmpl $0, %eax
- movl %eax, %esi
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
- movl BP_code32_start(%esi), %eax
- leal preferred_addr(%eax), %eax
- jmp *%eax
-
-preferred_addr:
-#endif
cld
/*
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -208,6 +143,70 @@ preferred_addr:
jmp *%eax
ENDPROC(startup_32)
+#ifdef CONFIG_EFI_STUB
+/*
+ * We don't need the return address, so set up the stack so efi_main() can find
+ * its arguments.
+ */
+ENTRY(efi_pe_entry)
+ add $0x4, %esp
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ popl %ecx
+ movl %ecx, efi32_config(%esi) /* Handle */
+ popl %ecx
+ movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+
+ call make_boot_params
+ cmpl $0, %eax
+ je fail
+ movl %esi, BP_code32_start(%eax)
+ popl %ecx
+ pushl %eax
+ pushl %ecx
+ jmp 2f /* Skip efi_config initialization */
+ENDPROC(efi_pe_entry)
+
+ENTRY(efi32_stub_entry)
+ add $0x4, %esp
+ popl %ecx
+ popl %edx
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ movl %ecx, efi32_config(%esi) /* Handle */
+ movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+2:
+ call efi_main
+ cmpl $0, %eax
+ movl %eax, %esi
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leal startup_32(%eax), %eax
+ jmp *%eax
+ENDPROC(efi32_stub_entry)
+#endif
+
.text
relocated:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fbf4c32d0b62..b4a5d284391c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -243,65 +243,6 @@ ENTRY(startup_64)
* that maps our entire kernel(text+data+bss+brk), zero page
* and command line.
*/
-#ifdef CONFIG_EFI_STUB
- /*
- * The entry point for the PE/COFF executable is efi_pe_entry, so
- * only legacy boot loaders will execute this jmp.
- */
- jmp preferred_addr
-
-ENTRY(efi_pe_entry)
- movq %rcx, efi64_config(%rip) /* Handle */
- movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
-
- leaq efi64_config(%rip), %rax
- movq %rax, efi_config(%rip)
-
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- addq %rbp, efi64_config+40(%rip)
-
- movq %rax, %rdi
- call make_boot_params
- cmpq $0,%rax
- je fail
- mov %rax, %rsi
- leaq startup_32(%rip), %rax
- movl %eax, BP_code32_start(%rsi)
- jmp 2f /* Skip the relocation */
-
-handover_entry:
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- movq efi_config(%rip), %rax
- addq %rbp, 40(%rax)
-2:
- movq efi_config(%rip), %rdi
- call efi_main
- movq %rax,%rsi
- cmpq $0,%rax
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
- movl BP_code32_start(%esi), %eax
- leaq preferred_addr(%rax), %rax
- jmp *%rax
-
-preferred_addr:
-#endif
/* Setup data segments. */
xorl %eax, %eax
@@ -413,6 +354,59 @@ lvl5:
jmp *%rax
#ifdef CONFIG_EFI_STUB
+
+/* The entry point for the PE/COFF executable is efi_pe_entry. */
+ENTRY(efi_pe_entry)
+ movq %rcx, efi64_config(%rip) /* Handle */
+ movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ addq %rbp, efi64_config+40(%rip)
+
+ movq %rax, %rdi
+ call make_boot_params
+ cmpq $0,%rax
+ je fail
+ mov %rax, %rsi
+ leaq startup_32(%rip), %rax
+ movl %eax, BP_code32_start(%rsi)
+ jmp 2f /* Skip the relocation */
+
+handover_entry:
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ movq efi_config(%rip), %rax
+ addq %rbp, 40(%rax)
+2:
+ movq efi_config(%rip), %rdi
+ call efi_main
+ movq %rax,%rsi
+ cmpq $0,%rax
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leaq startup_64(%rax), %rax
+ jmp *%rax
+ENDPROC(efi_pe_entry)
+
.org 0x390
ENTRY(efi64_stub_entry)
movq %rdi, efi64_config(%rip) /* Handle */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 91f27ab970ef..17818ba6906f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -37,7 +37,9 @@
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
+#include <linux/efi.h>
#include <generated/utsrelease.h>
+#include <asm/efi.h>
/* Macros used by the included decompressor code below. */
#define STATIC
@@ -479,35 +481,31 @@ static unsigned long slots_fetch_random(void)
return 0;
}
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_mem_region(struct mem_vector *entry,
unsigned long minimum,
unsigned long image_size)
{
struct mem_vector region, overlap;
struct slot_area slot_area;
unsigned long start_orig, end;
- struct boot_e820_entry cur_entry;
-
- /* Skip non-RAM entries. */
- if (entry->type != E820_TYPE_RAM)
- return;
+ struct mem_vector cur_entry;
/* On 32-bit, ignore entries entirely above our maximum. */
- if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+ if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
return;
/* Ignore entries entirely below our minimum. */
- if (entry->addr + entry->size < minimum)
+ if (entry->start + entry->size < minimum)
return;
/* Ignore entries above memory limit */
- end = min(entry->size + entry->addr, mem_limit);
- if (entry->addr >= end)
+ end = min(entry->size + entry->start, mem_limit);
+ if (entry->start >= end)
return;
- cur_entry.addr = entry->addr;
- cur_entry.size = end - entry->addr;
+ cur_entry.start = entry->start;
+ cur_entry.size = end - entry->start;
- region.start = cur_entry.addr;
+ region.start = cur_entry.start;
region.size = cur_entry.size;
/* Give up if slot area array is full. */
@@ -521,8 +519,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
/* Potentially raise address to meet alignment needs. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
- /* Did we raise the address above this e820 region? */
- if (region.start > cur_entry.addr + cur_entry.size)
+ /* Did we raise the address above the passed in memory entry? */
+ if (region.start > cur_entry.start + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
@@ -562,31 +560,126 @@ static void process_e820_entry(struct boot_e820_entry *entry,
}
}
-static unsigned long find_random_phys_addr(unsigned long minimum,
- unsigned long image_size)
+#ifdef CONFIG_EFI
+/*
+ * Returns true if a mirror region was found (and has already been
+ * processed for slot adding).
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
{
+ struct efi_info *e = &boot_params->efi_info;
+ bool efi_mirror_found = false;
+ struct mem_vector region;
+ efi_memory_desc_t *md;
+ unsigned long pmap;
+ char *signature;
+ u32 nr_desc;
int i;
- unsigned long addr;
- /* Check if we had too many memmaps. */
- if (memmap_too_large) {
- debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
- return 0;
+ signature = (char *)&e->efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+ return false;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+ return false;
}
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
- /* Make sure minimum is aligned. */
- minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+ nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ efi_mirror_found = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+ /*
+ * Here we are more conservative in picking free memory than
+ * the EFI spec allows:
+ *
+ * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
+ * free memory and thus available to place the kernel image into,
+ * but in practice there's firmware where using that memory leads
+ * to crashes.
+ *
+ * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
+ */
+ if (md->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ if (efi_mirror_found &&
+ !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+ continue;
+
+ region.start = md->phys_addr;
+ region.size = md->num_pages << EFI_PAGE_SHIFT;
+ process_mem_region(&region, minimum, image_size);
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+ break;
+ }
+ }
+ return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ struct mem_vector region;
+ struct boot_e820_entry *entry;
/* Verify potential e820 positions, appending to slots list. */
for (i = 0; i < boot_params->e820_entries; i++) {
- process_e820_entry(&boot_params->e820_table[i], minimum,
- image_size);
+ entry = &boot_params->e820_table[i];
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+ region.start = entry->addr;
+ region.size = entry->size;
+ process_mem_region(&region, minimum, image_size);
if (slot_area_index == MAX_SLOT_AREA) {
debug_putstr("Aborted e820 scan (slot_areas full)!\n");
break;
}
}
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ /* Check if we had too many memmaps. */
+ if (memmap_too_large) {
+ debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+ return 0;
+ }
+
+ /* Make sure minimum is aligned. */
+ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+ if (process_efi_entries(minimum, image_size))
+ return slots_fetch_random();
+ process_e820_entries(minimum, image_size);
return slots_fetch_random();
}
@@ -645,7 +738,7 @@ void choose_random_location(unsigned long input,
*/
min_addr = min(*output, 512UL << 20);
- /* Walk e820 and find a random address. */
+ /* Walk available memory entries to find a random address. */
random_addr = find_random_phys_addr(min_addr, output_size);
if (!random_addr) {
warn("Physical KASLR disabled: no suitable memory region!");
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a0838ab929f2..c14217cd0155 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,8 +116,7 @@ void __putstr(const char *s)
}
}
- if (boot_params->screen_info.orig_video_mode == 0 &&
- lines == 0 && cols == 0)
+ if (lines == 0 || cols == 0)
return;
x = boot_params->screen_info.orig_x;
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 28029be47fbb..f1aa43854bed 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -15,6 +15,13 @@
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)((unsigned long)(x)))
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2ed8f0c25def..1bb08ecffd24 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
# the description in lib/decompressor_xxx.c for specific information.
#
# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
-#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128)
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536)
#if ZO_z_output_len > ZO_z_input_len
# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
ZO_z_input_len)
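
Worked example of the slack formula change above, for an illustrative 32 MiB uncompressed image (the real input is ZO_z_output_len; 32 MiB is used only for the arithmetic):

	#define EXAMPLE_OUTPUT_LEN	(32u << 20)			 /* 33554432 bytes */
	#define OLD_EXTRA	((EXAMPLE_OUTPUT_LEN >> 12) + 65536 + 128) /*  73856 bytes */
	#define NEW_EXTRA	((EXAMPLE_OUTPUT_LEN >>  8) + 65536)	 /* 196608 bytes */
	/* NEW_EXTRA covers LZ4's worst case of one extra byte per 256 input bytes. */
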
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4b429df40d7a..550cd5012b73 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,3 +1,5 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
+CONFIG_GUESS_UNWINDER=y
+# CONFIG_FRAME_POINTER_UNWINDER is not set
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
.set T1, REG_T1
.endm
-#define K_BASE %r8
#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13
-#define BUFFER_END %r11
#define PRECALC_BUF %r14
#define WK_BUF %r15
@@ -205,14 +204,14 @@
* blended AVX2 and ALU instruction scheduling
* 1 vector iteration per 8 rounds
*/
- vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
- vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
@@ -255,7 +254,7 @@
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor WY_TMP2, WY_TMP, WY
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
- vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
.endm
+/* Add the constant \d to \a only if \b >= \c (uses RTA as a temporary):
+ * \a += (\b >= \c) ? \d : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
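
As a reading aid only (not part of the patch), the macro above boils down to the following C, with a/b/c/d standing for its four arguments and the comparison being signed:

static inline void add_if_ge(long *a, long b, long c, long d)
{
	if (b >= c)
		*a += d;
}

In other words, BUFFER_PTR/BUFFER_PTR2 advance by 64 or 128 bytes only while enough blocks remain in BLOCKS_CTR.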
+
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
@@ -463,13 +472,16 @@
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
- PRECALC_OFFSET = 0
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
- PRECALC_OFFSET = 128
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
@@ -479,8 +491,8 @@ _loop:
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
- cmp K_BASE, BUFFER_PTR
- jne _begin
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz _begin
.align 32
jmp _end
.align 32
@@ -512,10 +524,10 @@ _loop0:
.set j, j+2
.endr
- add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
- cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
-
+ /* Update the block counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
- cmp K_BASE, BUFFER_PTR /* is current block the last one? */
- je _loop
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz _loop
mov TB, B
@@ -575,10 +587,10 @@ _loop2:
.set j, j+2
.endr
- add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */
-
- cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */
- cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
+ /* Update the block counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed */
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp _loop3
_loop3:
@@ -641,19 +653,12 @@ _loop3:
avx2_zeroupper
- lea K_XMM_AR(%rip), K_BASE
-
+ /* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
- lea 64(BUF), BUFFER_PTR2
-
- shl $6, CNT /* mul by 64 */
- add BUF, CNT
- add $64, CNT
- mov CNT, BUFFER_END
- cmp BUFFER_END, BUFFER_PTR2
- cmovae K_BASE, BUFFER_PTR2
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f960a043cdeb..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
static bool avx2_usable(void)
{
- if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
&& boot_cpu_has(X86_FEATURE_BMI1)
&& boot_cpu_has(X86_FEATURE_BMI2))
return true;
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
/*
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %rdx, 12*8+\offset(%rsp)
movq %rsi, 13*8+\offset(%rsp)
movq %rdi, 14*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro SAVE_C_REGS offset=0
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %r12, 3*8+\offset(%rsp)
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset
.endm
.macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
movq 3*8+\offset(%rsp), %r12
movq 4*8+\offset(%rsp), %rbp
movq 5*8+\offset(%rsp), %rbx
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
movq 13*8(%rsp), %rsi
movq 14*8(%rsp), %rdi
+ UNWIND_HINT_IRET_REGS offset=16*8
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ addr_limit_user_check();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d271fb79248f..ca0b250eefc4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
+#include <asm/frame.h>
#include <linux/err.h>
.code64
@@ -43,9 +44,10 @@
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
+ UNWIND_HINT_EMPTY
swapgs
sysretq
-ENDPROC(native_usergs_sysret64)
+END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
.macro TRACE_IRQS_IRETQ
@@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64)
*/
ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
+ swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+ UNWIND_HINT_REGS extra=0
/*
* If we need to do entry work or if we guess we'll need to do
@@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath:
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
1:
@@ -316,6 +316,7 @@ syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
opportunistic_sysret_failed:
@@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
popq %rax
+ UNWIND_HINT_REGS extra=0
jmp entry_SYSCALL64_slow_path
1:
@@ -351,6 +353,7 @@ END(stub_ptregs_64)
.macro ptregs_stub func
ENTRY(ptregs_\func)
+ UNWIND_HINT_FUNC
leaq \func(%rip), %rax
jmp stub_ptregs_64
END(ptregs_\func)
@@ -367,6 +370,7 @@ END(ptregs_\func)
* %rsi: next task
*/
ENTRY(__switch_to_asm)
+ UNWIND_HINT_FUNC
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -406,6 +410,7 @@ END(__switch_to_asm)
* r12: kernel thread arg
*/
ENTRY(ret_from_fork)
+ UNWIND_HINT_EMPTY
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -413,6 +418,7 @@ ENTRY(ret_from_fork)
jnz 1f /* kernel threads are uncommon */
2:
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
@@ -440,13 +446,102 @@ END(ret_from_fork)
ENTRY(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
pushq $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
jmp common_interrupt
.align 8
+ vector=vector+1
.endr
END(irq_entries_start)
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushfq
+ testl $X86_EFLAGS_IF, (%rsp)
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK regs=1 old_rsp
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ movq %rsp, \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS base=\old_rsp
+ .endif
+
+ incl PER_CPU_VAR(irq_count)
+ jnz .Lirq_stack_push_old_rsp_\@
+
+ /*
+ * Right now, if we just incremented irq_count to zero, we've
+ * claimed the IRQ stack but we haven't switched to it yet.
+ *
+ * If anything is added that can interrupt us here without using IST,
+ * it must be *extremely* careful to limit its stack usage. This
+ * could include kprobes and a hypothetical future IST-less #DB
+ * handler.
+ *
+ * The OOPS unwinder relies on the word at the top of the IRQ
+ * stack linking back to the previous RSP for the entire time we're
+ * on the IRQ stack. For this to work reliably, we need to write
+ * it before we actually move ourselves to the IRQ stack.
+ */
+
+ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+ movq PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * If the first movq above becomes wrong due to IRQ stack layout
+ * changes, the only way we'll notice is if we try to unwind right
+ * here. Assert that we set up the stack right to catch this type
+ * of bug quickly.
+ */
+ cmpq -8(%rsp), \old_rsp
+ je .Lirq_stack_okay\@
+ ud2
+ .Lirq_stack_okay\@:
+#endif
+
+.Lirq_stack_push_old_rsp_\@:
+ pushq \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS indirect=1
+ .endif
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK regs=1
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ /* We need to be off the IRQ stack before decrementing irq_count. */
+ popq %rsp
+
+ .if \regs
+ UNWIND_HINT_REGS
+ .endif
+
+ /*
+ * As in ENTER_IRQ_STACK, while irq_count == 0 we still own the
+ * IRQ stack even though we are no longer running on it.
+ */
+
+ decl PER_CPU_VAR(irq_count)
+.endm
+
/*
* Interrupt entry/exit.
*
@@ -485,17 +580,7 @@ END(irq_entries_start)
CALL_enter_from_user_mode
1:
- /*
- * Save previous stack pointer, optionally switch to interrupt stack.
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
- movq %rsp, %rdi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rdi
+ ENTER_IRQ_STACK old_rsp=%rdi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -515,10 +600,8 @@ common_interrupt:
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
- /* Restore saved previous stack */
- popq %rsp
+ LEAVE_IRQ_STACK
testb $3, CS(%rsp)
jz retint_kernel
@@ -561,6 +644,7 @@ restore_c_regs_and_iret:
INTERRUPT_RETURN
ENTRY(native_iret)
+ UNWIND_HINT_IRET_REGS
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
@@ -633,6 +717,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
/*
* At this point, we cannot write to the stack any more, but we can
@@ -654,6 +739,7 @@ END(common_interrupt)
*/
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS
ASM_CLAC
pushq $~(\num)
.Lcommon_\sym:
@@ -675,13 +761,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
#endif
/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY .popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
@@ -740,6 +821,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS offset=8
+
/* Sanity check */
.if \shift_ist != -1 && \paranoid == 0
.error "using shift_ist requires paranoid=1"
@@ -763,6 +846,7 @@ ENTRY(\sym)
.else
call error_entry
.endif
+ UNWIND_HINT_REGS
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
.if \paranoid
@@ -860,6 +944,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
* edi: new selector
*/
ENTRY(native_load_gs_index)
+ FRAME_BEGIN
pushfq
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
@@ -868,8 +953,9 @@ ENTRY(native_load_gs_index)
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
SWAPGS
popfq
+ FRAME_END
ret
-END(native_load_gs_index)
+ENDPROC(native_load_gs_index)
EXPORT_SYMBOL(native_load_gs_index)
_ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -892,14 +978,12 @@ bad_gs:
ENTRY(do_softirq_own_stack)
pushq %rbp
mov %rsp, %rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr), %rsp
- push %rbp /* frame pointer backlink */
+ ENTER_IRQ_STACK regs=0 old_rsp=%r11
call __do_softirq
+ LEAVE_IRQ_STACK regs=0
leaveq
- decl PER_CPU_VAR(irq_count)
ret
-END(do_softirq_own_stack)
+ENDPROC(do_softirq_own_stack)
#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
@@ -923,14 +1007,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
* see the correct pointer to the pt_regs
*/
+ UNWIND_HINT_FUNC
movq %rdi, %rsp /* we don't return, adjust the stack frame */
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp, %rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rbp /* frame pointer backlink */
+ UNWIND_HINT_REGS
+
+ ENTER_IRQ_STACK old_rsp=%r10
call xen_evtchn_do_upcall
- popq %rsp
- decl PER_CPU_VAR(irq_count)
+ LEAVE_IRQ_STACK
+
#ifndef CONFIG_PREEMPT
call xen_maybe_preempt_hcall
#endif
@@ -951,6 +1035,7 @@ END(xen_do_hypervisor_callback)
* with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
+ UNWIND_HINT_EMPTY
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
jne 1f
@@ -970,11 +1055,13 @@ ENTRY(xen_failsafe_callback)
pushq $0 /* RIP */
pushq %r11
pushq %rcx
+ UNWIND_HINT_IRET_REGS offset=8
jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
@@ -1020,6 +1107,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1047,6 +1135,7 @@ END(paranoid_entry)
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
@@ -1068,6 +1157,7 @@ END(paranoid_exit)
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1152,6 +1242,7 @@ END(error_entry)
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
ENTRY(error_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
testl %ebx, %ebx
@@ -1161,6 +1252,7 @@ END(error_exit)
/* Runs on exception stack */
ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
/*
* Fix up the exception frame if we're on Xen.
* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
@@ -1211,6 +1303,8 @@ ENTRY(nmi)
* other IST entries.
*/
+ ASM_CLAC
+
/* Use %rdx as our temp variable throughout */
pushq %rdx
@@ -1232,11 +1326,13 @@ ENTRY(nmi)
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
pushq 3*8(%rdx) /* pt_regs->flags */
pushq 2*8(%rdx) /* pt_regs->cs */
pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -1253,6 +1349,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
ENCODE_FRAME_POINTER
/*
@@ -1407,6 +1504,7 @@ first_nmi:
.rept 5
pushq 11*8(%rsp)
.endr
+ UNWIND_HINT_IRET_REGS
/* Everything up to here is safe from nested NMIs */
@@ -1422,6 +1520,7 @@ first_nmi:
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
INTERRUPT_RETURN /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
1:
#endif
@@ -1471,6 +1570,7 @@ end_repeat_nmi:
* exceptions might do.
*/
call paranoid_entry
+ UNWIND_HINT_REGS
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
@@ -1508,17 +1608,19 @@ nmi_restore:
END(nmi)
ENTRY(ignore_sysret)
+ UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
ENTRY(rewind_stack_do_exit)
+ UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
call do_exit
-1: jmp 1b
END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..4b86d8da3ea3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
*/
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ swapgs
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -342,8 +341,7 @@ ENTRY(entry_INT80_compat)
jmp restore_regs_and_iret
END(entry_INT80_compat)
- ALIGN
-GLOBAL(stub32_clone)
+ENTRY(stub32_clone)
/*
* The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
* The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -353,3 +351,4 @@ GLOBAL(stub32_clone)
*/
xchg %r8, %rcx
jmp sys_clone
+ENDPROC(stub32_clone)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index ad44af0dd667..f5cbbba99283 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
if (amd_uncore_llc) {
unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared;
+ unsigned int nshared, subleaf, prev_eax = 0;
uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
- nshared = ((eax >> 14) & 0xfff) + 1;
+ /*
+ * Iterate over Cache Topology Definition leaves until no
+ * more cache descriptions are available.
+ */
+ for (subleaf = 0; subleaf < 5; subleaf++) {
+ cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
+
+ /* EAX[0:4] gives type of cache */
+ if (!(eax & 0x1f))
+ break;
+
+ prev_eax = eax;
+ }
+ nshared = ((prev_eax >> 14) & 0xfff) + 1;
+
uncore->id = apicid - (apicid % nshared);
uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
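
Hedged user-space sketch (not kernel code) of the same CPUID walk as the hunk above: stop when EAX[4:0] reports no further cache level and derive the sharing count from the last valid sub-leaf. It assumes a GCC/Clang toolchain that provides <cpuid.h>; the 0x1f mask and the EAX[25:14] field come from the code being patched.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, subleaf, prev_eax = 0;

	for (subleaf = 0; subleaf < 5; subleaf++) {
		/* Returns 0 if the extended leaf is not supported at all. */
		if (!__get_cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx))
			break;
		if (!(eax & 0x1f))	/* cache type 0: no more caches */
			break;
		prev_eax = eax;
	}
	/* EAX[25:14] holds (number of logical CPUs sharing the cache) - 1. */
	printf("last cache level shared by %u threads\n",
	       ((prev_eax >> 14) & 0xfff) + 1);
	return 0;
}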
@@ -555,7 +568,7 @@ static int __init amd_uncore_init(void)
ret = 0;
}
- if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
amd_uncore_llc = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_llc) {
ret = -ENOMEM;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8e3db8f642a7..80534d3c2480 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event)
return m == b;
}
-int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_max_precise(void)
{
- if (event->attr.precise_ip) {
- int precise = 0;
+ int precise = 0;
- /* Support for constant skid */
- if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ /* Support for constant skid */
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
- precise++;
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
+ }
+ return precise;
+}
- if (x86_pmu.pebs_prec_dist)
- precise++;
- }
+int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = x86_pmu_max_precise();
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
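
Hedged illustration of what the check above means for callers of perf_event_open(); the field names come from <linux/perf_event.h>, and the exact ceiling is whatever x86_pmu_max_precise() reports for the running CPU, the same value the new caps/max_precise attribute (added further down) exposes.

#include <linux/perf_event.h>
#include <string.h>

/* Minimal sketch: request a precise (PEBS-sampled) cycles event. */
static void init_precise_cycles(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	/* Rejected with EOPNOTSUPP by the hunk above if this exceeds the
	 * PMU's maximum precise_ip level. */
	attr->precise_ip = 2;
}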
@@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
}
static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
static int __init init_hw_perf_events(void)
{
@@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+ if (x86_pmu.caps_attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_caps_group.attrs = tmp;
+ }
+
if (x86_pmu.event_attrs)
x86_pmu_events_group.attrs = x86_pmu.event_attrs;
@@ -2114,7 +2129,7 @@ static void refresh_pce(void *ignored)
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
-static void x86_pmu_event_mapped(struct perf_event *event)
+static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
@@ -2129,22 +2144,20 @@ static void x86_pmu_event_mapped(struct perf_event *event)
* For now, this can't happen because all callers hold mmap_sem
* for write. If this changes, we'll need a different solution.
*/
- lockdep_assert_held_exclusive(&current->mm->mmap_sem);
+ lockdep_assert_held_exclusive(&mm->mmap_sem);
- if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
- on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+ if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
-static void x86_pmu_event_unmapped(struct perf_event *event)
+static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!current->mm)
- return;
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
- if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
- on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+ if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}
static int x86_pmu_event_idx(struct perf_event *event)
@@ -2215,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = {
.attrs = x86_pmu_attrs,
};
+static ssize_t max_precise_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+ &dev_attr_max_precise.attr,
+ NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+ .name = "caps",
+ .attrs = x86_pmu_caps_attrs,
+};
+
static const struct attribute_group *x86_pmu_attr_groups[] = {
&x86_pmu_attr_group,
&x86_pmu_format_group,
&x86_pmu_events_group,
+ &x86_pmu_caps_group,
NULL,
};
@@ -2337,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
- if (idx > LDT_ENTRIES)
- return 0;
-
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->nr_entries)
+ if (!ldt || idx >= ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
@@ -2350,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;
#endif
} else {
- if (idx > GDT_ENTRIES)
+ if (idx >= GDT_ENTRIES)
return 0;
desc = raw_cpu_ptr(gdt_page.gdt) + idx;
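
(For orientation: the removed LDT_ENTRIES test is redundant once the index is bounded against the table's actual size, and both remaining checks switch from ">" to ">=" because an index equal to the entry count already points one slot past the last valid descriptor.)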
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 8ae8c5ce3a1f..16076eb34699 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -69,7 +69,7 @@ struct bts_buffer {
struct bts_phys buf[0];
};
-struct pmu bts_pmu;
+static struct pmu bts_pmu;
static size_t buf_size(struct page *page)
{
@@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags)
bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
- event->hw.itrace_started = 1;
+ perf_event_itrace_started(event);
event->hw.state = 0;
__bts_event_start(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 98b0f0729527..829e89cfcee2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_any.attr,
&format_attr_inv.attr,
&format_attr_cmask.attr,
+ NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
&format_attr_in_tx.attr,
&format_attr_in_tx_cp.attr,
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
- &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
- &format_attr_ldlat.attr, /* PEBS load latency */
- NULL,
+static struct attribute *nhm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ NULL
};
static struct attribute *skl_format_attr[] = {
@@ -3781,6 +3795,36 @@ done:
static DEVICE_ATTR_RW(freeze_on_smi);
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+ &dev_attr_pmu_name.attr,
+ NULL
+};
+
static struct attribute *intel_pmu_attrs[] = {
&dev_attr_freeze_on_smi.attr,
NULL,
@@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void)
unsigned int unused;
struct extra_reg *er;
int version, i;
+ struct attribute **extra_attr = NULL;
+ char *name;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void)
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_CORE_YONAH:
pr_cont("Core events, ");
+ name = "core";
break;
case INTEL_FAM6_CORE2_MEROM:
@@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_core2_event_constraints;
x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
pr_cont("Core2 events, ");
+ name = "core2";
break;
case INTEL_FAM6_NEHALEM:
@@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void)
intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;
+ extra_attr = nhm_format_attr;
pr_cont("Nehalem events, ");
+ name = "nehalem";
break;
case INTEL_FAM6_ATOM_PINEVIEW:
@@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
+ name = "bonnell";
break;
case INTEL_FAM6_ATOM_SILVERMONT1:
@@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_slm_extra_regs;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = slm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Silvermont events, ");
+ name = "silvermont";
break;
case INTEL_FAM6_ATOM_GOLDMONT:
@@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = glm_events_attrs;
+ extra_attr = slm_format_attr;
pr_cont("Goldmont events, ");
+ name = "goldmont";
break;
case INTEL_FAM6_ATOM_GEMINI_LAKE:
@@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void)
x86_pmu.cpu_events = glm_events_attrs;
/* Goldmont Plus has 4-wide pipeline */
event_attr_td_total_slots_scale_glm.event_str = "4";
+ extra_attr = slm_format_attr;
pr_cont("Goldmont plus events, ");
+ name = "goldmont_plus";
break;
case INTEL_FAM6_WESTMERE:
@@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void)
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
intel_pmu_pebs_data_source_nhm();
+ extra_attr = nhm_format_attr;
pr_cont("Westmere events, ");
+ name = "westmere";
break;
case INTEL_FAM6_SANDYBRIDGE:
@@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("SandyBridge events, ");
+ name = "sandybridge";
break;
case INTEL_FAM6_IVYBRIDGE:
@@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ extra_attr = nhm_format_attr;
+
pr_cont("IvyBridge events, ");
+ name = "ivybridge";
break;
@@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.lbr_double_abort = true;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Haswell events, ");
+ name = "haswell";
break;
case INTEL_FAM6_BROADWELL_CORE:
@@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.limit_period = bdw_limit_period;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
pr_cont("Broadwell events, ");
+ name = "broadwell";
break;
case INTEL_FAM6_XEON_PHI_KNL:
@@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void)
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
+ extra_attr = slm_format_attr;
pr_cont("Knights Landing/Mill events, ");
+ name = "knights-landing";
break;
case INTEL_FAM6_SKYLAKE_MOBILE:
@@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
- skl_format_attr);
- WARN_ON(!x86_pmu.format_attrs);
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_attr = merge_attr(extra_attr, skl_format_attr);
x86_pmu.cpu_events = hsw_events_attrs;
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
+ name = "skylake";
break;
default:
@@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void)
case 1:
x86_pmu.event_constraints = intel_v1_event_constraints;
pr_cont("generic architected perfmon v1, ");
+ name = "generic_arch_v1";
break;
default:
/*
@@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void)
*/
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v2+";
break;
}
}
+ snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+ if (version >= 2 && extra_attr) {
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ extra_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ }
+
if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
- if (x86_pmu.lbr_nr)
+ x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+ if (x86_pmu.lbr_nr) {
+ x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+ }
+
/*
* Access extra MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support offcore event
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed5f8ed..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
*/
#define P(a, b) PERF_MEM_S(a, b)
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
static u64 pebs_data_source[] = {
- P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
- OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
- OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
- OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
- OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
- OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
- OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
- OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
- OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
- OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
- OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
- OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
};
/* Patch up minor differences in the bits */
void __init intel_pmu_pebs_data_source_nhm(void)
{
- pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
- pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
- pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
}
static u64 precise_store_data(u64 status)
@@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
- int model = boot_cpu_data.x86_model;
- int fam = boot_cpu_data.x86;
dse.val = status;
@@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status)
/*
* Nehalem models do not support TLB, Lock infos
*/
- if (fam == 0x6 && (model == 26 || model == 30
- || model == 31 || model == 46)) {
+ if (x86_pmu.pebs_no_tlb) {
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
@@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
else
regs->flags &= ~PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_ADDR) &&
+ if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 955457a30197..8a6bbacd17dc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
X86_BR_CALL_STACK = 1 << 16,/* call stack */
X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].in_tx = 0;
cpuc->lbr_entries[i].abort = 0;
cpuc->lbr_entries[i].cycles = 0;
+ cpuc->lbr_entries[i].type = 0;
cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
@@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
cpuc->lbr_entries[out].cycles = cycles;
+ cpuc->lbr_entries[out].type = 0;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
@@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_CALL)
mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return ret;
}
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_UNKNOWN, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_UNKNOWN, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
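
Worked example, for orientation only: a conditional jump sampled in user mode reaches this helper as X86_BR_USER | X86_BR_JCC. Shifting out the two privilege bits leaves only the X86_BR_JCC bit, and __ffs() of that mask indexes branch_map[] at its PERF_BR_COND slot, which is what later gets stored in the LBR entry's new "type" field.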
+
/*
* implement actual branch filter based on user demand.
* Hardware may not exactly satisfy that request, thus
@@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
bool compress = false;
/* if sampling all branches, then nothing to filter */
- if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
return;
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].from = 0;
compress = true;
}
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
}
if (!compress)
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index eb0533558c2b..d32c0eed38ca 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids
* P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
* either up to date automatically or not applicable at all.
*/
-struct p4_event_alias {
+static struct p4_event_alias {
u64 original;
u64 alternative;
} p4_event_aliases[] = {
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index ae8324d65e61..81fd41d5a0d9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event)
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
- if (!event->hw.itrace_started) {
- event->hw.itrace_started = 1;
+ /* First round: clear STATUS, in particular the PSB byte counter. */
+ if (!event->hw.config) {
+ perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index a45e2114a846..8e2457cb6b4a 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = {
.attrs = rapl_formats_attr,
};
-const struct attribute_group *rapl_attr_groups[] = {
+static const struct attribute_group *rapl_attr_groups[] = {
&rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 44ec523287f6..1c5390f1cf09 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = {
NULL,
};
-static struct attribute_group uncore_pmu_attr_group = {
+static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index cda569332005..6a5cbe90f859 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -272,7 +272,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_ubox_format_group = {
+static const struct attribute_group nhmex_uncore_ubox_format_group = {
.name = "format",
.attrs = nhmex_uncore_ubox_formats_attr,
};
@@ -299,7 +299,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_cbox_format_group = {
+static const struct attribute_group nhmex_uncore_cbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_cbox_formats_attr,
};
@@ -407,7 +407,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_bbox_format_group = {
+static const struct attribute_group nhmex_uncore_bbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_bbox_formats_attr,
};
@@ -484,7 +484,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_sbox_format_group = {
+static const struct attribute_group nhmex_uncore_sbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_sbox_formats_attr,
};
@@ -898,7 +898,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_mbox_format_group = {
+static const struct attribute_group nhmex_uncore_mbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_mbox_formats_attr,
};
@@ -1163,7 +1163,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
NULL,
};
-static struct attribute_group nhmex_uncore_rbox_format_group = {
+static const struct attribute_group nhmex_uncore_rbox_format_group = {
.name = "format",
.attrs = nhmex_uncore_rbox_formats_attr,
};
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a3dcc12bef4a..db1127ce685e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -130,7 +130,7 @@ static struct attribute *snb_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group snb_uncore_format_group = {
+static const struct attribute_group snb_uncore_format_group = {
.name = "format",
.attrs = snb_uncore_formats_attr,
};
@@ -289,7 +289,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = {
NULL,
};
-static struct attribute_group snb_uncore_imc_format_group = {
+static const struct attribute_group snb_uncore_imc_format_group = {
.name = "format",
.attrs = snb_uncore_imc_formats_attr,
};
@@ -769,7 +769,7 @@ static struct attribute *nhm_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group nhm_uncore_format_group = {
+static const struct attribute_group nhm_uncore_format_group = {
.name = "format",
.attrs = nhm_uncore_formats_attr,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 4f9127644b80..db1fe377e6dd 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -602,27 +602,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = {
{ /* end: all zeroes */ },
};
-static struct attribute_group snbep_uncore_format_group = {
+static const struct attribute_group snbep_uncore_format_group = {
.name = "format",
.attrs = snbep_uncore_formats_attr,
};
-static struct attribute_group snbep_uncore_ubox_format_group = {
+static const struct attribute_group snbep_uncore_ubox_format_group = {
.name = "format",
.attrs = snbep_uncore_ubox_formats_attr,
};
-static struct attribute_group snbep_uncore_cbox_format_group = {
+static const struct attribute_group snbep_uncore_cbox_format_group = {
.name = "format",
.attrs = snbep_uncore_cbox_formats_attr,
};
-static struct attribute_group snbep_uncore_pcu_format_group = {
+static const struct attribute_group snbep_uncore_pcu_format_group = {
.name = "format",
.attrs = snbep_uncore_pcu_formats_attr,
};
-static struct attribute_group snbep_uncore_qpi_format_group = {
+static const struct attribute_group snbep_uncore_qpi_format_group = {
.name = "format",
.attrs = snbep_uncore_qpi_formats_attr,
};
@@ -1431,27 +1431,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
NULL,
};
-static struct attribute_group ivbep_uncore_format_group = {
+static const struct attribute_group ivbep_uncore_format_group = {
.name = "format",
.attrs = ivbep_uncore_formats_attr,
};
-static struct attribute_group ivbep_uncore_ubox_format_group = {
+static const struct attribute_group ivbep_uncore_ubox_format_group = {
.name = "format",
.attrs = ivbep_uncore_ubox_formats_attr,
};
-static struct attribute_group ivbep_uncore_cbox_format_group = {
+static const struct attribute_group ivbep_uncore_cbox_format_group = {
.name = "format",
.attrs = ivbep_uncore_cbox_formats_attr,
};
-static struct attribute_group ivbep_uncore_pcu_format_group = {
+static const struct attribute_group ivbep_uncore_pcu_format_group = {
.name = "format",
.attrs = ivbep_uncore_pcu_formats_attr,
};
-static struct attribute_group ivbep_uncore_qpi_format_group = {
+static const struct attribute_group ivbep_uncore_qpi_format_group = {
.name = "format",
.attrs = ivbep_uncore_qpi_formats_attr,
};
@@ -1887,7 +1887,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_ubox_format_group = {
+static const struct attribute_group knl_uncore_ubox_format_group = {
.name = "format",
.attrs = knl_uncore_ubox_formats_attr,
};
@@ -1927,7 +1927,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_cha_format_group = {
+static const struct attribute_group knl_uncore_cha_format_group = {
.name = "format",
.attrs = knl_uncore_cha_formats_attr,
};
@@ -2037,7 +2037,7 @@ static struct attribute *knl_uncore_pcu_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_pcu_format_group = {
+static const struct attribute_group knl_uncore_pcu_format_group = {
.name = "format",
.attrs = knl_uncore_pcu_formats_attr,
};
@@ -2187,7 +2187,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = {
NULL,
};
-static struct attribute_group knl_uncore_irp_format_group = {
+static const struct attribute_group knl_uncore_irp_format_group = {
.name = "format",
.attrs = knl_uncore_irp_formats_attr,
};
@@ -2385,7 +2385,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_ubox_format_group = {
+static const struct attribute_group hswep_uncore_ubox_format_group = {
.name = "format",
.attrs = hswep_uncore_ubox_formats_attr,
};
@@ -2439,7 +2439,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_cbox_format_group = {
+static const struct attribute_group hswep_uncore_cbox_format_group = {
.name = "format",
.attrs = hswep_uncore_cbox_formats_attr,
};
@@ -2621,7 +2621,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = {
NULL,
};
-static struct attribute_group hswep_uncore_sbox_format_group = {
+static const struct attribute_group hswep_uncore_sbox_format_group = {
.name = "format",
.attrs = hswep_uncore_sbox_formats_attr,
};
@@ -3314,7 +3314,7 @@ static struct attribute *skx_uncore_cha_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_uncore_chabox_format_group = {
+static const struct attribute_group skx_uncore_chabox_format_group = {
.name = "format",
.attrs = skx_uncore_cha_formats_attr,
};
@@ -3427,7 +3427,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_uncore_iio_format_group = {
+static const struct attribute_group skx_uncore_iio_format_group = {
.name = "format",
.attrs = skx_uncore_iio_formats_attr,
};
@@ -3484,7 +3484,7 @@ static struct attribute *skx_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_uncore_format_group = {
+static const struct attribute_group skx_uncore_format_group = {
.name = "format",
.attrs = skx_uncore_formats_attr,
};
@@ -3605,7 +3605,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = {
NULL,
};
-static struct attribute_group skx_upi_uncore_format_group = {
+static const struct attribute_group skx_upi_uncore_format_group = {
.name = "format",
.attrs = skx_upi_uncore_formats_attr,
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3a4cab..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
- PERF_SAMPLE_TRANSACTION)
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
/*
* A debug store configuration.
@@ -558,6 +558,7 @@ struct x86_pmu {
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
+ struct attribute **caps_attrs;
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
@@ -591,7 +592,8 @@ struct x86_pmu {
pebs :1,
pebs_active :1,
pebs_broken :1,
- pebs_prec_dist :1;
+ pebs_prec_dist :1,
+ pebs_no_tlb :1;
int pebs_record_size;
int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
@@ -741,6 +743,8 @@ int x86_reserve_hardware(void);
void x86_release_hardware(void);
+int x86_pmu_max_precise(void);
+
void hw_perf_lbr_event_destroy(struct perf_event *event);
int x86_setup_perfctr(struct perf_event *event);
@@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void);
void intel_pmu_pebs_data_source_nhm(void);
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 724153797209..e0bb46c02857 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
if (ksig->ka.sa.sa_flags & SA_ONSTACK)
sp = sigsp(sp, ksig);
/* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER32_DS &&
+ else if (regs->ss != __USER32_DS &&
!(ksig->ka.sa.sa_flags & SA_RESTORER) &&
ksig->ka.sa.sa_restorer)
sp = (unsigned long) ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2efc768e4362..72d867f6b518 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
extern int x86_acpi_numa_init(void);
#endif /* CONFIG_ACPI_NUMA */
-#define acpi_unlazy_tlb(x) leave_mm(x)
-
#ifdef CONFIG_ACPI_APEI
static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
{
@@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
* you call efi_mem_attributes() during boot and at runtime,
* you could theoretically see different attributes.
*
- * Since we are yet to see any x86 platforms that require
- * anything other than PAGE_KERNEL (some arm64 platforms
- * require the equivalent of PAGE_KERNEL_NOCACHE), return that
- * until we know differently.
+ * We are yet to see any x86 platforms that require anything
+ * other than PAGE_KERNEL (some ARM64 platforms require the
+ * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+ * is active, the ACPI information will not be encrypted,
+ * so return PAGE_KERNEL_NOENC until we know differently.
*/
- return PAGE_KERNEL;
+ return PAGE_KERNEL_NOENC;
}
#endif
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
# define _ASM_EXTABLE_EX(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
# define _ASM_NOKPROBE(entry) \
.pushsection "_kprobe_blacklist","aw" ; \
_ASM_ALIGN ; \
@@ -123,6 +126,9 @@
# define _ASM_EXTABLE_EX(from, to) \
_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
return xchg(&v->counter, new);
}
-#define ATOMIC_OP(op) \
-static inline void atomic_##op(int i, atomic_t *v) \
-{ \
- asm volatile(LOCK_PREFIX #op"l %1,%0" \
- : "+m" (v->counter) \
- : "ir" (i) \
- : "memory"); \
+static inline void atomic_and(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "andl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int atomic_fetch_and(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
+
+ do { } while (!atomic_try_cmpxchg(v, &val, val & i));
+
+ return val;
}
-#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
-{ \
- int val = atomic_read(v); \
- do { \
- } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \
- return val; \
+static inline void atomic_or(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "orl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
}
-#define ATOMIC_OPS(op, c_op) \
- ATOMIC_OP(op) \
- ATOMIC_FETCH_OP(op, c_op)
+static inline int atomic_fetch_or(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
-ATOMIC_OPS(and, &)
-ATOMIC_OPS(or , |)
-ATOMIC_OPS(xor, ^)
+ do { } while (!atomic_try_cmpxchg(v, &val, val | i));
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
+ return val;
+}
+
+static inline void atomic_xor(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int atomic_fetch_xor(int i, atomic_t *v)
+{
+ int val = atomic_read(v);
+
+ do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
+
+ return val;
+}
/**
* __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
int c = atomic_read(v);
+
do {
if (unlikely(c == u))
break;
} while (!atomic_try_cmpxchg(v, &c, c + a));
+
return c;
}
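
[Editor's note] The open-coded atomic_fetch_{and,or,xor}() helpers above all share one shape: read the counter, then spin on atomic_try_cmpxchg() with an empty loop body, relying on try_cmpxchg refreshing the expected value on failure. A rough userspace sketch of that pattern, using GCC's __atomic builtins as a stand-in for atomic_try_cmpxchg() (this is an illustration of the idiom, not the kernel API):

/*
 * Userspace sketch of the try-cmpxchg fetch-op loop used above.
 * __atomic_compare_exchange_n() updates 'val' with the current memory
 * contents on failure, so the loop body can stay empty, exactly like
 * the kernel's atomic_fetch_or().
 */
#include <stdio.h>

static int fetch_or_sketch(int *counter, int i)
{
	int val = __atomic_load_n(counter, __ATOMIC_RELAXED);

	while (!__atomic_compare_exchange_n(counter, &val, val | i,
					    0 /* strong */,
					    __ATOMIC_SEQ_CST,
					    __ATOMIC_RELAXED))
		;	/* 'val' was reloaded on failure; just retry */

	return val;	/* value observed before the OR */
}

int main(void)
{
	int counter = 0x3;
	int old = fetch_or_sketch(&counter, 0x4);

	printf("old=%#x new=%#x\n", old, counter);	/* old=0x3 new=0x7 */
	return 0;
}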
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
-#define ATOMIC64_OP(op, c_op) \
-static inline void atomic64_##op(long long i, atomic64_t *v) \
-{ \
- long long old, c = 0; \
- while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
- c = old; \
+static inline void atomic64_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
}
-#define ATOMIC64_FETCH_OP(op, c_op) \
-static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \
-{ \
- long long old, c = 0; \
- while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
- c = old; \
- return old; \
+static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
+
+ return old;
}
-ATOMIC64_FETCH_OP(add, +)
+static inline void atomic64_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
-#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
+ while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+}
+
+static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+
+ return old;
+}
-#define ATOMIC64_OPS(op, c_op) \
- ATOMIC64_OP(op, c_op) \
- ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+}
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+
+ return old;
+}
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
+ c = old;
+
+ return old;
+}
+
+#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
#endif /* _ASM_X86_ATOMIC64_32_H */
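
[Editor's note] The 32-bit atomic64_* helpers above use a different idiom: start from a guess (c = 0), attempt cmpxchg(v, c, c op i), and adopt the returned old value as the next guess until the exchange hits. A minimal userspace sketch of that convergence loop, with GCC's __sync_val_compare_and_swap() standing in for atomic64_cmpxchg() (illustrative only):

/*
 * Sketch of the cmpxchg convergence loop used by the 32-bit
 * atomic64_fetch_*() emulation above.  The builtin returns the value
 * actually found in memory, so a mismatch hands us a better guess for
 * the next iteration.
 */
#include <stdio.h>

static long long fetch_and_sketch(long long *counter, long long i)
{
	long long old, c = 0;

	while ((old = __sync_val_compare_and_swap(counter, c, c & i)) != c)
		c = old;	/* guess was stale; retry with the observed value */

	return old;		/* value before the AND */
}

int main(void)
{
	long long counter = 0xff;
	long long old = fetch_and_sketch(&counter, 0x0f);

	printf("old=%#llx new=%#llx\n", old, counter);	/* old=0xff new=0xf */
	return 0;
}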
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
}
#define atomic64_try_cmpxchg atomic64_try_cmpxchg
-static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
{
return try_cmpxchg(&v->counter, old, new);
}
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
*/
static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
{
- long c = atomic64_read(v);
+ s64 c = atomic64_read(v);
do {
if (unlikely(c == u))
return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
*/
static inline long atomic64_dec_if_positive(atomic64_t *v)
{
- long dec, c = atomic64_read(v);
+ s64 dec, c = atomic64_read(v);
do {
dec = c - 1;
if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
return dec;
}
-#define ATOMIC64_OP(op) \
-static inline void atomic64_##op(long i, atomic64_t *v) \
-{ \
- asm volatile(LOCK_PREFIX #op"q %1,%0" \
- : "+m" (v->counter) \
- : "er" (i) \
- : "memory"); \
+static inline void atomic64_and(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "andq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
}
-#define ATOMIC64_FETCH_OP(op, c_op) \
-static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
-{ \
- long val = atomic64_read(v); \
- do { \
- } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \
- return val; \
+static inline long atomic64_fetch_and(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
+
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val & i));
+ return val;
}
-#define ATOMIC64_OPS(op, c_op) \
- ATOMIC64_OP(op) \
- ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_or(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "orq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long atomic64_fetch_or(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val | i));
+ return val;
+}
+
+static inline void atomic64_xor(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static inline long atomic64_fetch_xor(long i, atomic64_t *v)
+{
+ s64 val = atomic64_read(v);
+
+ do {
+ } while (!atomic64_try_cmpxchg(v, &val, val ^ i));
+ return val;
+}
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
#define _ASM_X86_CMDLINE_H
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
({ \
bool success; \
- __typeof__(_ptr) _old = (_pold); \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
__typeof__(*(_ptr)) __old = *_old; \
__typeof__(*(_ptr)) __new = (_new); \
switch (size) { \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ca3c48c0872f..42bbbf0f173d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
/*
@@ -196,6 +196,7 @@
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
@@ -286,7 +287,7 @@
#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5dff775af7cd..c10c9128f54e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -21,11 +21,13 @@
# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID 0
#else
# define DISABLE_VME 0
# define DISABLE_K6_MTRR 0
# define DISABLE_CYRIX_ARR 0
# define DISABLE_CENTAUR_MCR 0
+# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
#endif /* CONFIG_X86_64 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
#define DISABLED_MASK1 0
#define DISABLED_MASK2 0
#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4 0
+#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 0
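
[Editor's note] DISABLE_PCID lands in DISABLED_MASK4 because feature bits are numbered word*32 + bit; X86_FEATURE_PCID is assumed here to be word 4, bit 17 (CPUID leaf 1, ECX), so masking it out on 32-bit means setting bit (X86_FEATURE_PCID & 31) in mask number 4. A tiny illustration of that decomposition under that assumption:

/*
 * Word/bit decomposition behind the DISABLED_MASK4 change above.
 * The X86_FEATURE_PCID value is assumed to be (4*32 + 17); the real
 * definition lives in cpufeatures.h.
 */
#include <stdio.h>

#define X86_FEATURE_PCID	(4 * 32 + 17)	/* assumed: word 4, bit 17 */

int main(void)
{
	int word = X86_FEATURE_PCID / 32;
	int bit  = X86_FEATURE_PCID & 31;

	printf("word %d, bit %d, mask %#x\n", word, bit, 1u << bit);
	return 0;
}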
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 398c79889f5c..1387dafdba2d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,6 +12,7 @@
#include <asm/io.h>
#include <asm/swiotlb.h>
#include <linux/dma-contiguous.h>
+#include <linux/mem_encrypt.h>
#ifdef CONFIG_ISA
# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
{
- return paddr;
+ return __sme_set(paddr);
}
static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
{
- return daddr;
+ return __sme_clr(daddr);
}
#endif /* CONFIG_X86_DMA_REMAP */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 3c69fed215c5..a8e15b04565b 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
}
/* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap early_ioremap
-#define dmi_early_unmap early_iounmap
-#define dmi_remap ioremap_cache
-#define dmi_unmap iounmap
+#define dmi_early_remap early_memremap
+#define dmi_early_unmap early_memunmap
+#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x) memunmap(_x)
#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index a504adc661a4..cd266d830e49 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
extern void e820__reallocate_tables(void);
extern void e820__register_nosave_regions(unsigned long limit_pfn);
+extern int e820__get_entry_type(u64 start, u64 end);
+
/*
* Returns true iff the specified range [start,end) is completely contained inside
* the ISA region.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..04330c8d9af9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -126,15 +126,15 @@ do { \
pr_reg[4] = regs->di; \
pr_reg[5] = regs->bp; \
pr_reg[6] = regs->ax; \
- pr_reg[7] = regs->ds & 0xffff; \
- pr_reg[8] = regs->es & 0xffff; \
- pr_reg[9] = regs->fs & 0xffff; \
+ pr_reg[7] = regs->ds; \
+ pr_reg[8] = regs->es; \
+ pr_reg[9] = regs->fs; \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
- pr_reg[13] = regs->cs & 0xffff; \
+ pr_reg[13] = regs->cs; \
pr_reg[14] = regs->flags; \
pr_reg[15] = regs->sp; \
- pr_reg[16] = regs->ss & 0xffff; \
+ pr_reg[16] = regs->ss; \
} while (0);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
+ unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -226,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- (pr_reg)[21] = current->thread.fsbase; \
- (pr_reg)[22] = current->thread.gsbase; \
+ rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
+ rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
@@ -247,11 +248,11 @@ extern int force_personality32;
/*
* This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
* space open for things that want to use the area for 32-bit pointers.
*/
#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
- 0x100000000UL)
+ (TASK_SIZE / 3 * 2))
/* This yields a mask that user programs can use to figure out what
instruction set this CPU supports. This could be done in user space,
@@ -304,8 +305,8 @@ static inline int mmap_is_ia32(void)
test_thread_flag(TIF_ADDR32));
}
-extern unsigned long tasksize_32bit(void);
-extern unsigned long tasksize_64bit(void);
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
extern unsigned long get_mmap_base(int is_legacy);
#ifdef CONFIG_X86_32
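
[Editor's note] The new ELF_ET_DYN_BASE places 64-bit PIE binaries at two thirds of the task size instead of at a fixed 4GB. A quick worked computation, assuming the usual 47-bit user address space (2^47 - PAGE_SIZE), which gives the familiar 0x5555... PIE load addresses:

/*
 * Worked example for the new 64-bit ELF_ET_DYN_BASE, assuming a 47-bit
 * user address space.  The result is roughly 0x5555_5555_4aaa, i.e. the
 * 0x5555... mappings commonly seen in /proc/<pid>/maps for PIE binaries.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long task_size = (1ULL << 47) - page_size;
	unsigned long long et_dyn_base = task_size / 3 * 2;

	printf("ELF_ET_DYN_BASE ~= %#llx\n", et_dyn_base);
	return 0;
}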
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b65155cc3760..dcd9fb55e679 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx,
}
#endif
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
+
#include <asm-generic/fixmap.h>
#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..554cdb205d17 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
return 0;
}
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
if (use_xsave()) {
- copy_kernel_to_xregs(&fpstate->xsave, -1);
+ copy_kernel_to_xregs(&fpstate->xsave, mask);
} else {
if (use_fxsr())
copy_kernel_to_fxregs(&fpstate->fxsave);
@@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
: : [addr] "m" (fpstate));
}
- __copy_kernel_to_fpregs(fpstate);
+ __copy_kernel_to_fpregs(fpstate, -1);
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f5453436..f4dc9b63bdda 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret, tem;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
-
pagefault_disable();
switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
- break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
- break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
- break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
- break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
- break;
- default:
- ret = -ENOSYS;
- }
- }
+ if (!ret)
+ *oval = oldval;
+
return ret;
}
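
[Editor's note] After the refactor above, the x86 helper only reports the old futex value through *oval; the FUTEX_OP_CMP_* comparison that used to sit in the removed switch block now belongs to the generic caller. A rough sketch of that caller-side comparison (futex_cmp_result() is an illustrative helper name, not a kernel symbol):

/*
 * Hypothetical caller-side comparison, showing where the removed
 * switch (cmp) logic ends up once arch_futex_atomic_op_inuser() only
 * returns the old value.  The constants mirror the uapi FUTEX_OP_CMP_*
 * ordering.
 */
#include <stdio.h>

enum { FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_NE, FUTEX_OP_CMP_LT,
       FUTEX_OP_CMP_GE, FUTEX_OP_CMP_LE, FUTEX_OP_CMP_GT };

static int futex_cmp_result(int cmp, int oldval, int cmparg)
{
	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval <  cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval >  cmparg;
	default:              return -1;	/* -ENOSYS in the real code */
	}
}

int main(void)
{
	/* e.g. FUTEX_OP_CMP_GE with oldval 3, cmparg 2 -> 1 (wake) */
	printf("%d\n", futex_cmp_result(FUTEX_OP_CMP_GE, 3, 2));
	return 0;
}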
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 21126155a739..0ead9dbb9130 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
/* pin current vcpu to specified physical cpu (run rarely) */
void (*pin_vcpu)(int);
+
+ /* called during init_mem_mapping() to setup early mappings. */
+ void (*init_mem_mapping)(void);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
extern void hypervisor_pin_vcpu(int cpu);
+
+static inline void hypervisor_init_mem_mapping(void)
+{
+ if (x86_hyper && x86_hyper->init_mem_mapping)
+ x86_hyper->init_mem_mapping();
+}
#else
static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_x2apic_available(void) { return false; }
+static inline void hypervisor_init_mem_mapping(void) { }
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 474eb8c66fee..05c4aa00cc86 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -7,6 +7,7 @@ struct x86_mapping_info {
unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
bool direct_gbpages; /* PUD level 1GB page support */
+ unsigned long kernpg_flag; /* kernel pagetable flag override */
};
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 48febf07e828..c40a95c33bb8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
build_mmio_write(__writew, "w", unsigned short, "r", )
build_mmio_write(__writel, "l", unsigned int, "r", )
+#define readb readb
+#define readw readw
+#define readl readl
#define readb_relaxed(a) __readb(a)
#define readw_relaxed(a) __readw(a)
#define readl_relaxed(a) __readl(a)
@@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#define __raw_readw __readw
#define __raw_readl __readl
+#define writeb writeb
+#define writew writew
+#define writel writel
#define writeb_relaxed(v, a) __writeb(v, a)
#define writew_relaxed(v, a) __writew(v, a)
#define writel_relaxed(v, a) __writel(v, a)
@@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#ifdef CONFIG_X86_64
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
-#define readq_relaxed(a) readq(a)
-#define writeq_relaxed(v, a) writeq(v, a)
+#define readq_relaxed(a) __readq(a)
+#define writeq_relaxed(v, a) __writeq(v, a)
-#define __raw_readq(a) readq(a)
-#define __raw_writeq(val, addr) writeq(val, addr)
+#define __raw_readq __readq
+#define __raw_writeq __writeq
/* Let people know that we have them */
#define readq readq
@@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
{
return __pa(address);
}
+#define virt_to_phys virt_to_phys
/**
* phys_to_virt - map physical address to virtual
@@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
{
return __va(address);
}
+#define phys_to_virt phys_to_virt
/*
* Change "struct page" to physical address.
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
* else, you probably want one of the following.
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
+#define ioremap_prot ioremap_prot
/**
* ioremap - map bus memory into CPU space
@@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
{
return ioremap_nocache(offset, size);
}
+#define ioremap ioremap
extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
extern void set_iounmap_nonlazy(void);
@@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void);
#include <asm-generic/iomap.h>
/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-/**
- * memset_io Set a range of I/O memory to a constant value
- * @addr: The beginning of the I/O-memory range to set
- * @val: The value to set the memory to
- * @count: The number of bytes to set
- *
- * Set a range of I/O memory to a given value.
- */
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
-{
- memset((void __force *)addr, val, count);
-}
-
-/**
- * memcpy_fromio Copy a block of data from I/O memory
- * @dst: The (RAM) destination for the copy
- * @src: The (I/O memory) source for the data
- * @count: The number of bytes to copy
- *
- * Copy a block of data from I/O memory.
- */
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
-{
- memcpy(dst, (const void __force *)src, count);
-}
-
-/**
- * memcpy_toio Copy a block of data into I/O memory
- * @dst: The (I/O memory) destination for the copy
- * @src: The (RAM) source for the data
- * @count: The number of bytes to copy
- *
- * Copy a block of data to I/O memory.
- */
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
-{
- memcpy((void __force *)dst, src, count);
-}
-
-/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
* to PAGE_OFFSET is pure coincidence - it does not mean ISA values
@@ -341,13 +309,38 @@ BUILDIO(b, b, char)
BUILDIO(w, w, short)
BUILDIO(l, , int)
+#define inb inb
+#define inw inw
+#define inl inl
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb outb
+#define outw outw
+#define outl outl
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
extern void *xlate_dev_mem_ptr(phys_addr_t phys);
extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
extern bool is_early_ioremap_ptep(pte_t *ptep);
@@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
#define IO_SPACE_LIMIT 0xffff
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
#ifdef CONFIG_MTRR
extern int __must_check arch_phys_wc_index(int handle);
#define arch_phys_wc_index arch_phys_wc_index
@@ -381,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
#endif
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+ unsigned long size,
+ unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size);
+
#endif /* _ASM_X86_IO_H */
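
[Editor's note] The long run of '#define readb readb'-style lines above exists so that asm-generic/io.h, now included at the bottom of the header, sees that each accessor already has an arch definition and skips its fallback. A minimal standalone illustration of that guard pattern (names and fallback body are made up for the example):

/*
 * Illustration of the "#define foo foo" override pattern: the generic
 * header only supplies a fallback when the arch has not claimed the name.
 */
#include <stdio.h>

/* "arch" header: define the accessor and claim the name */
static inline unsigned char readb(const void *addr)
{
	return *(const volatile unsigned char *)addr;
}
#define readb readb

/* "generic" header: provide a fallback only if the arch did not */
#ifndef readb
static inline unsigned char readb(const void *addr)
{
	return 0;	/* generic fallback would go here */
}
#endif

int main(void)
{
	unsigned char byte = 0x5a;

	printf("%#x\n", readb(&byte));	/* uses the "arch" version: 0x5a */
	return 0;
}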
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 70ef205489f0..942c1f444da8 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -147,7 +147,8 @@ unsigned long
relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long start_address,
- unsigned int preserve_context);
+ unsigned int preserve_context,
+ unsigned int sme_active);
#endif
#define ARCH_HAS_KIMAGE_ARCH
@@ -207,6 +208,14 @@ struct kexec_entry64_regs {
uint64_t r15;
uint64_t rip;
};
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+ gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
#endif
typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fba6d8e..369e41c23f07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -492,6 +492,7 @@ struct kvm_vcpu_arch {
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
+ u32 pkru;
u32 hflags;
u64 efer;
u64 apic_base;
@@ -1078,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask);
+ u64 acc_track_mask, u64 me_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS 10
-#define GDT_ENTRY_LGUEST_DS 11
-#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
- /* Manually saved part. */
- unsigned long eax, ebx, ecx, edx;
- unsigned long esi, edi, ebp;
- unsigned long gs;
- unsigned long fs, ds, es;
- unsigned long trapnum, errcode;
- /* Trap pushed part */
- unsigned long eip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long esp;
- unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
- /* Host information we need to restore when we switch back. */
- u32 host_cr3;
- struct desc_ptr host_idt_desc;
- struct desc_ptr host_gdt_desc;
- u32 host_sp;
-
- /* Fields which are used when guest is running. */
- struct desc_ptr guest_idt_desc;
- struct desc_ptr guest_gdt_desc;
- struct x86_hw_tss guest_tss;
- struct desc_struct guest_idt[IDT_ENTRIES];
- struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
- /* The GDT entries copied into lguest_ro_state when running. */
- struct desc_struct gdt[GDT_ENTRIES];
-
- /* The IDT entries: some copied into lguest_ro_state when running. */
- struct desc_struct idt[IDT_ENTRIES];
-
- /* The address of the last guest-visible pagefault (ie. cr2). */
- unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
- u32 cr0;
-
- cr0 = read_cr0();
- if (!(cr0 & 8))
- write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
- ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC 0
-#define LHCALL_LGUEST_INIT 1
-#define LHCALL_SHUTDOWN 2
-#define LHCALL_NEW_PGTABLE 4
-#define LHCALL_FLUSH_TLB 5
-#define LHCALL_LOAD_IDT_ENTRY 6
-#define LHCALL_SET_STACK 7
-#define LHCALL_SET_CLOCKEVENT 9
-#define LHCALL_HALT 10
-#define LHCALL_SET_PMD 13
-#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PGD 15
-#define LHCALL_LOAD_TLS 16
-#define LHCALL_LOAD_GDT_ENTRY 18
-#define LHCALL_SEND_INTERRUPTS 19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF 1
-#define LGUEST_SHUTDOWN_RESTART 2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations? There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts). Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure. This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
- unsigned long arg1, unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* "int" is the Intel instruction to trigger a trap. */
- asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
- /* The call in %eax (aka "a") might be overwritten */
- : "=a"(call)
- /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
- : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
- /* "memory" means this might write somewhere in memory.
- * This isn't true for all calls, but it's safe to tell
- * gcc that it might happen so it doesn't get clever. */
- : "memory");
- return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
- /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..8e618fcf1f7c
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,80 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern unsigned long sme_me_mask;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+ unsigned long decrypted_kernel_vaddr,
+ unsigned long kernel_len,
+ unsigned long encryption_wa,
+ unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void __init sme_encrypt_kernel(void);
+void __init sme_enable(struct boot_params *bp);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask 0UL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void __init sme_encrypt_kernel(void) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register. Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoid special case handling of PGD allocations.
+ */
+#define __sme_pa(x) (__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __X86_MEM_ENCRYPT_H__ */
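
[Editor's note] The new header exports sme_me_mask and builds __sme_pa() on top of it; elsewhere in this series (dma-mapping.h, processor-flags.h) the generic __sme_set()/__sme_clr() helpers apply or strip the same mask. A minimal userspace sketch of that set/clear pattern, assuming a single C-bit at an arbitrary position (bit 47 here purely for illustration):

/*
 * Userspace sketch of the SME mask handling used throughout this series.
 * sme_me_mask holds either 0 (SME inactive) or one encryption bit; the
 * local __sme_set()/__sme_clr() macros mirror the generic helpers this
 * header relies on.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t sme_me_mask = 1ULL << 47;	/* assumed C-bit position */

#define __sme_set(x)	((x) | sme_me_mask)
#define __sme_clr(x)	((x) & ~sme_me_mask)

int main(void)
{
	uint64_t paddr = 0x12345000ULL;
	uint64_t enc = __sme_set(paddr);	/* e.g. what phys_to_dma() produces */
	uint64_t dec = __sme_clr(enc);		/* e.g. what dma_to_phys() recovers */

	printf("paddr=%#llx enc=%#llx dec=%#llx\n",
	       (unsigned long long)paddr,
	       (unsigned long long)enc,
	       (unsigned long long)dec);
	return 0;
}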
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 79b647a7ebd0..bb8c597c2248 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,12 +3,28 @@
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/atomic.h>
/*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
*/
typedef struct {
+ /*
+ * ctx_id uniquely identifies this mm_struct. A ctx_id will never
+ * be reused, and zero is not a valid ctx_id.
+ */
+ u64 ctx_id;
+
+ /*
+ * Any code that needs to do any sort of TLB flushing for this
+ * mm will first make its changes to the page tables, then
+ * increment tlb_gen, then flush. This lets the low-level
+ * flushing code keep track of what needs flushing.
+ *
+ * This is not used on Xen PV.
+ */
+ atomic64_t tlb_gen;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
#endif
@@ -37,6 +53,11 @@ typedef struct {
#endif
} mm_context_t;
+#define INIT_MM_CONTEXT(mm) \
+ .context = { \
+ .ctx_id = 1, \
+ }
+
void leave_mm(int cpu);
#endif /* _ASM_X86_MMU_H */
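
[Editor's note] The tlb_gen field added above follows a simple protocol: change the page tables, bump the mm-wide generation, then flush, so the flushing code can compare the generation it has already handled against the mm's current one. A rough userspace sketch of that generation-tracking idea using C11 atomics (illustrative only, not the kernel's tlbflush code):

/*
 * Generation-counter sketch mirroring the tlb_gen comment above: writers
 * publish a new generation after modifying the "page tables"; each CPU
 * records the generation it last flushed to and only has work to do when
 * it lags behind.
 */
#include <stdio.h>
#include <stdatomic.h>
#include <stdint.h>

static atomic_uint_fast64_t mm_tlb_gen;		/* like mm->context.tlb_gen */
static uint64_t cpu_flushed_gen;		/* per-CPU "flushed up to" mark */

static void writer_changed_page_tables(void)
{
	/* ... modify page tables first ... */
	atomic_fetch_add(&mm_tlb_gen, 1);	/* then publish a new generation */
	/* ... then request the flush ... */
}

static void cpu_maybe_flush(void)
{
	uint64_t cur = atomic_load(&mm_tlb_gen);

	if (cpu_flushed_gen < cur) {
		/* stale: a real implementation would flush the TLB here */
		cpu_flushed_gen = cur;
	}
}

int main(void)
{
	writer_changed_page_tables();
	cpu_maybe_flush();
	printf("flushed up to generation %llu\n",
	       (unsigned long long)cpu_flushed_gen);
	return 0;
}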
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 265c907d7d4c..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
#ifndef CONFIG_PARAVIRT
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
@@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+ cpumask_clear_cpu(cpu, mm_cpumask(mm));
}
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+ atomic64_set(&mm->context.tlb_gen, 0);
+
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
@@ -140,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
- init_new_context_ldt(tsk, mm);
-
- return 0;
+ return init_new_context_ldt(tsk, mm);
}
static inline void destroy_context(struct mm_struct *mm)
{
@@ -292,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819caeef..9eb7c718aaf8 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -2,6 +2,15 @@
#define _ASM_X86_MODULE_H
#include <asm-generic/module.h>
+#include <asm/orc_types.h>
+
+struct mod_arch_specific {
+#ifdef CONFIG_ORC_UNWINDER
+ unsigned int num_orcs;
+ int *orc_unwind_ip;
+ struct orc_entry *orc_unwind;
+#endif
+};
#ifdef CONFIG_X86_64
/* X86_64 does not define MODULE_PROC_FAMILY */
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a0d662be4c5b..7d7404756bb4 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
}
void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+ unsigned long flags);
#else
static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
{
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
}
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+ unsigned long len, unsigned long flags)
+{
+ return addr;
+}
#endif /* CONFIG_X86_INTEL_MPX */
#endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5573c75f8e4c..17f5c12e1afd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -356,6 +356,8 @@
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000000..91c8d868424d
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range. An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER 8
+#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP (unsigned long)_stext
+#define LOOKUP_STOP_IP (unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
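
[Editor's note] The lookup table above divides the [LOOKUP_START_IP, LOOKUP_STOP_IP) text range into 256-byte blocks, so for a given instruction address the unwinder only has to binary-search the .orc_unwind slice bounded by two adjacent lookup entries. A small sketch of the index calculation implied by the header (the addresses are made up for illustration):

/*
 * Block-index calculation implied by the comment above: with
 * LOOKUP_BLOCK_ORDER = 8 each block covers 256 bytes of text, and the
 * shift replaces a division because the block size is a power of 2.
 */
#include <stdio.h>

#define LOOKUP_BLOCK_ORDER	8
#define LOOKUP_BLOCK_SIZE	(1 << LOOKUP_BLOCK_ORDER)

int main(void)
{
	unsigned long long lookup_start_ip = 0xffffffff81000000ULL;	/* assumed _stext */
	unsigned long long ip              = 0xffffffff81002345ULL;	/* address to unwind */

	unsigned long long idx = (ip - lookup_start_ip) >> LOOKUP_BLOCK_ORDER;

	/* orc_lookup[idx] and orc_lookup[idx + 1] would bound the binary search */
	printf("block index %llu (block size %d)\n", idx, LOOKUP_BLOCK_SIZE);
	return 0;
}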
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..9c9dc579bd7d
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED 0
+#define ORC_REG_PREV_SP 1
+#define ORC_REG_DX 2
+#define ORC_REG_DI 3
+#define ORC_REG_BP 4
+#define ORC_REG_SP 5
+#define ORC_REG_R10 6
+#define ORC_REG_R13 7
+#define ORC_REG_BP_INDIRECT 8
+#define ORC_REG_SP_INDIRECT 9
+#define ORC_REG_MAX 15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL 0
+#define ORC_TYPE_REGS 1
+#define ORC_TYPE_REGS_IRET 2
+#define UNWIND_HINT_TYPE_SAVE 3
+#define UNWIND_HINT_TYPE_RESTORE 4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard. It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address. Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+ s16 sp_offset;
+ s16 bp_offset;
+ unsigned sp_reg:4;
+ unsigned bp_reg:4;
+ unsigned type:2;
+} __packed;
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
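
[Editor's note] Packed as above, each orc_entry works out to 6 bytes: two s16 offsets plus 4 + 4 + 2 bits of register/type information rounded up to two bytes, which is where the modest ORC data footprint mentioned in orc_lookup.h comes from. A quick standalone check of that layout (a reconstruction of the struct for illustration, not an include of the kernel header):

/*
 * Standalone reconstruction of struct orc_entry, just to show that the
 * packed layout above comes to 6 bytes per entry.
 */
#include <stdio.h>
#include <stdint.h>

struct orc_entry {
	int16_t sp_offset;
	int16_t bp_offset;
	unsigned sp_reg:4;
	unsigned bp_reg:4;
	unsigned type:2;
} __attribute__((packed));

int main(void)
{
	printf("sizeof(struct orc_entry) = %zu\n", sizeof(struct orc_entry));	/* 6 */
	return 0;
}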
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..b50df06ad251 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
+#ifdef CONFIG_X86_MCE
+#define arch_unmap_kpfn arch_unmap_kpfn
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 7bd0099384ca..b98ed9d14630 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
#include <linux/const.h>
#include <linux/types.h>
+#include <linux/mem_encrypt.h>
/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT 12
@@ -15,7 +16,7 @@
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
-#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 77037b6f1caa..bbeae4a2bd01 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
+#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
@@ -13,9 +14,18 @@
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
: (prot))
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
+
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
@@ -38,6 +48,8 @@ extern struct list_head pgd_list;
extern struct mm_struct *pgd_page_get_mm(struct page *page);
+extern pmdval_t early_pmd_flags;
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+ return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
static inline int p4d_large(p4d_t p4d)
{
/* No 512 GiB pages yet */
@@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) \
- pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
+#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pud_page(pud) \
- pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
+#define pud_page(pud) pfn_to_page(pud_pfn(pud))
/* Find an entry in the second-level page table.. */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define p4d_page(p4d) \
- pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d))
/* Find an entry in the third-level page table.. */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..399261ce904c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -2,6 +2,8 @@
#define _ASM_X86_PGTABLE_DEFS_H
#include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
#include <asm/page_types.h>
#define FIRST_USER_ADDRESS 0UL
@@ -121,10 +123,10 @@
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY)
+#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
/*
* Set of bits not changed in pte_modify. The pte's
@@ -159,6 +161,7 @@ enum page_cache_mode {
#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -187,22 +190,42 @@ enum page_cache_mode {
#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP)
#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#ifndef __ASSEMBLY__
+
+#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | _PAGE_ENC)
+
+#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
-#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#endif /* __ASSEMBLY__ */
/* xwr */
#define __P000 PAGE_NONE
@@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
#else
#include <asm-generic/pgtable-nop4d.h>
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
return native_pgd_val(p4d.pgd);
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..dc723b64acf0 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -2,6 +2,7 @@
#define _ASM_X86_PROCESSOR_FLAGS_H
#include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
#ifdef CONFIG_VM86
#define X86_VM_MASK X86_EFLAGS_VM
@@ -32,16 +33,18 @@
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
#ifdef CONFIG_X86_64
-/* Mask off the address space ID bits. */
-#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
-#define CR3_PCID_MASK 0xFFFull
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH BIT_ULL(63)
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
* a tiny bit of code size by setting all the bits.
*/
-#define CR3_ADDR_MASK 0xFFFFFFFFull
-#define CR3_PCID_MASK 0ull
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
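A small standalone sketch of what the new CR3_ADDR_MASK/__sme_clr() combination accomplishes: pulling the page-table physical address out of a CR3-style value means dropping both the low PCID bits and the SME C-bit. The C-bit position and the sample value are assumptions made for illustration only.

#include <stdint.h>
#include <stdio.h>

#define CR3_PCID_MASK 0xFFFULL
#define SME_MASK      (1ULL << 47)                        /* example C-bit */
#define CR3_ADDR_MASK (0x7FFFFFFFFFFFF000ULL & ~SME_MASK) /* __sme_clr()   */

int main(void)
{
    uint64_t cr3 = (1ULL << 47) | 0x12345000ULL | 0x005;  /* C-bit, pa, PCID */

    printf("pa   = %#llx\n", (unsigned long long)(cr3 & CR3_ADDR_MASK));
    printf("pcid = %#llx\n", (unsigned long long)(cr3 & CR3_PCID_MASK));
    return 0;
}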
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 028245e1c42b..3fa26a61eabc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -29,6 +30,7 @@ struct vm86;
#include <linux/math64.h>
#include <linux/err.h>
#include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
/*
* We handle most unaligned accesses in hardware. On the other hand
@@ -239,9 +241,14 @@ static inline unsigned long read_cr3_pa(void)
return __read_cr3() & CR3_ADDR_MASK;
}
+static inline unsigned long native_read_cr3_pa(void)
+{
+ return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
static inline void load_cr3(pgd_t *pgdir)
{
- write_cr3(__pa(pgdir));
+ write_cr3(__sme_pa(pgdir));
}
#ifdef CONFIG_X86_32
@@ -661,7 +668,7 @@ static inline void sync_core(void)
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
- * but it will fault at CPL3 (i.e. Xen PV and lguest).
+ * but it will fault at CPL3 (i.e. Xen PV).
*
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
@@ -684,6 +691,7 @@ static inline void sync_core(void)
unsigned int tmp;
asm volatile (
+ UNWIND_HINT_SAVE
"mov %%ss, %0\n\t"
"pushq %q0\n\t"
"pushq %%rsp\n\t"
@@ -693,6 +701,7 @@ static inline void sync_core(void)
"pushq %q0\n\t"
"pushq $1f\n\t"
"iretq\n\t"
+ UNWIND_HINT_RESTORE
"1:"
: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
#endif
@@ -802,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x)
*/
#define IA32_PAGE_OFFSET PAGE_OFFSET
#define TASK_SIZE PAGE_OFFSET
+#define TASK_SIZE_LOW TASK_SIZE
#define TASK_SIZE_MAX TASK_SIZE
+#define DEFAULT_MAP_WINDOW TASK_SIZE
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX STACK_TOP
@@ -842,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x)
* particular problem by preventing anything from being mapped
* at the maximum canonical address.
*/
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
@@ -850,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x)
#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
0xc0000000 : 0xFFFFe000)
+#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-#define STACK_TOP TASK_SIZE
+#define STACK_TOP TASK_SIZE_LOW
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
@@ -876,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
* space during mmap's.
*/
#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
-#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE)
+#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)
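The TASK_SIZE_MAX/DEFAULT_MAP_WINDOW split above can be seen with a few lines of arithmetic: the hard limit follows the paging mode via __VIRTUAL_MASK_SHIFT, while the default mmap window stays at the legacy 47-bit boundary. The sketch below just prints both values for 47-bit and 56-bit virtual address widths; it is illustrative, not kernel code.

#include <stdio.h>

int main(void)
{
    unsigned long long page_size = 4096;
    unsigned long long va_bits[] = { 47, 56 };      /* 4-level, 5-level paging */

    for (int i = 0; i < 2; i++) {
        unsigned long long task_size_max = (1ULL << va_bits[i]) - page_size;
        unsigned long long map_window    = (1ULL << 47) - page_size;

        printf("VA bits %llu: TASK_SIZE_MAX=%#llx DEFAULT_MAP_WINDOW=%#llx\n",
               va_bits[i], task_size_max, map_window);
    }
    return 0;
}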
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..91c04c8e67fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -9,6 +9,20 @@
#ifdef __i386__
struct pt_regs {
+ /*
+ * NB: 32-bit x86 CPUs are inconsistent as to what happens in the
+ * following cases (where %seg represents a segment register):
+ *
+ * - pushl %seg: some do a 16-bit write and leave the high
+ * bits alone
+ * - movl %seg, [mem]: some do a 16-bit write despite the movl
+ * - IDT entry: some (e.g. 486) will leave the high bits of CS
+ * and (if applicable) SS undefined.
+ *
+ * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+ * so we can just treat all of the segment registers as 16-bit
+ * values.
+ */
unsigned long bx;
unsigned long cx;
unsigned long dx;
@@ -16,16 +30,22 @@ struct pt_regs {
unsigned long di;
unsigned long bp;
unsigned long ax;
- unsigned long ds;
- unsigned long es;
- unsigned long fs;
- unsigned long gs;
+ unsigned short ds;
+ unsigned short __dsh;
+ unsigned short es;
+ unsigned short __esh;
+ unsigned short fs;
+ unsigned short __fsh;
+ unsigned short gs;
+ unsigned short __gsh;
unsigned long orig_ax;
unsigned long ip;
- unsigned long cs;
+ unsigned short cs;
+ unsigned short __csh;
unsigned long flags;
unsigned long sp;
- unsigned long ss;
+ unsigned short ss;
+ unsigned short __ssh;
};
#else /* __i386__ */
@@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
if (offset == offsetof(struct pt_regs, sp) &&
regs->cs == __KERNEL_CS)
return kernel_stack_pointer(regs);
+
+ /* The selector fields are 16-bit. */
+ if (offset == offsetof(struct pt_regs, cs) ||
+ offset == offsetof(struct pt_regs, ss) ||
+ offset == offsetof(struct pt_regs, ds) ||
+ offset == offsetof(struct pt_regs, es) ||
+ offset == offsetof(struct pt_regs, fs) ||
+ offset == offsetof(struct pt_regs, gs)) {
+ return *(u16 *)((unsigned long)regs + offset);
+ }
#endif
return *(unsigned long *)((unsigned long)regs + offset);
}
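The selector handling added to regs_get_register() is easy to reproduce in plain C: once the segment fields are 16 bits wide with explicit padding, a read by byte offset has to be narrowed to u16 or it would pick up the padding too. The struct below is a simplified stand-in for pt_regs, not the real layout.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct regs32 {
    unsigned long ax;
    unsigned short cs, __csh;   /* 16-bit selector + explicit padding */
    unsigned long flags;
};

static unsigned long get_reg(struct regs32 *r, size_t offset)
{
    if (offset == offsetof(struct regs32, cs))
        return *(uint16_t *)((char *)r + offset);   /* narrow to 16 bits */
    return *(unsigned long *)((char *)r + offset);
}

int main(void)
{
    struct regs32 r = { .ax = 0x1234, .cs = 0x10, .__csh = 0xdead };

    printf("cs=%#lx ax=%#lx\n",
           get_reg(&r, offsetof(struct regs32, cs)),
           get_reg(&r, offsetof(struct regs32, ax)));
    return 0;
}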
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 230e1903acf0..90d91520c13a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -1,6 +1,15 @@
#ifndef _ARCH_X86_REALMODE_H
#define _ARCH_X86_REALMODE_H
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT 0
+#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
#include <linux/types.h>
#include <asm/io.h>
@@ -38,6 +47,7 @@ struct trampoline_header {
u64 start;
u64 efer;
u32 cr4;
+ u32 flags;
#endif
};
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
void set_real_mode_mem(phys_addr_t mem, size_t size);
void reserve_real_mode(void);
+#endif /* __ASSEMBLY__ */
+
#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+
+/*
+ * This is the first portion of the refcount error handling, which lives in
+ * .text.unlikely, and is jumped to from the CPU flag check (in the
+ * following macros). This saves the refcount value location into CX for
+ * the exception handler to use (in mm/extable.c), and then triggers the
+ * central refcount exception. The fixup address for the exception points
+ * back to the regular execution flow in .text.
+ */
+#define _REFCOUNT_EXCEPTION \
+ ".pushsection .text.unlikely\n" \
+ "111:\tlea %[counter], %%" _ASM_CX "\n" \
+ "112:\t" ASM_UD0 "\n" \
+ ASM_UNREACHABLE \
+ ".popsection\n" \
+ "113:\n" \
+ _ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO \
+ "js 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO \
+ "jz 111f\n\t" \
+ REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR \
+ "jmp 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : "ir" (i)
+ : "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "incl %0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "decl %0\n\t"
+ REFCOUNT_CHECK_LE_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "er", i, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+ GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "%0", e);
+}
+
+static __always_inline __must_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+ int c, result;
+
+ c = atomic_read(&(r->refs));
+ do {
+ if (unlikely(c == 0))
+ return false;
+
+ result = c + i;
+
+ /* Did we try to increment from/to an undesirable state? */
+ if (unlikely(c < 0 || c == INT_MAX || result < c)) {
+ asm volatile(REFCOUNT_ERROR
+ : : [counter] "m" (r->refs.counter)
+ : "cc", "cx");
+ break;
+ }
+
+ } while (!atomic_try_cmpxchg(&(r->refs), &c, result));
+
+ return c != 0;
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+ return refcount_add_not_zero(1, r);
+}
+
+#endif
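For the logic in refcount_add_not_zero() above, here is a portable C11 sketch of the same checks with the inline-asm exception replaced by a saturate-and-report path. It illustrates the zero/overflow conditions being guarded, not the kernel's implementation; the wrap check is expressed without relying on signed overflow.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

static bool ref_add_not_zero(atomic_int *refs, int i)
{
    int c = atomic_load(refs);
    int result;

    do {
        if (c == 0)
            return false;                   /* object already dead */

        /* Increment from/to an undesirable state? (negative, already at
         * INT_MAX, or about to wrap) -> would trap in the x86 version. */
        if (c < 0 || c == INT_MAX || i > INT_MAX - c) {
            atomic_store(refs, INT_MAX);    /* saturate instead */
            return true;
        }

        result = c + i;
    } while (!atomic_compare_exchange_weak(refs, &c, result));

    return true;                            /* c was nonzero */
}

int main(void)
{
    atomic_int refs = 1;
    return ref_add_not_zero(&refs, 1) ? 0 : 1;
}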
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..045f99211a99 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,45 +1,56 @@
#ifndef _ASM_X86_RMWcc
#define _ASM_X86_RMWcc
+#define __CLOBBERS_MEM "memory"
+#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx"
+
#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
/* Use asm goto */
-#define __GEN_RMWcc(fullop, var, cc, ...) \
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
- : : "m" (var), ## __VA_ARGS__ \
- : "memory" : cc_label); \
+ : : [counter] "m" (var), ## __VA_ARGS__ \
+ : clobbers : cc_label); \
return 0; \
cc_label: \
return 1; \
} while (0)
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
- __GEN_RMWcc(op " " arg0, var, cc)
+#define __BINARY_RMWcc_ARG " %1, "
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
- __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
/* Use flags output or a set instruction */
-#define __GEN_RMWcc(fullop, var, cc, ...) \
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
do { \
bool c; \
asm volatile (fullop ";" CC_SET(cc) \
- : "+m" (var), CC_OUT(cc) (c) \
- : __VA_ARGS__ : "memory"); \
+ : [counter] "+m" (var), CC_OUT(cc) (c) \
+ : __VA_ARGS__ : clobbers); \
return c; \
} while (0)
+#define __BINARY_RMWcc_ARG " %2, "
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
- __GEN_RMWcc(op " " arg0, var, cc)
+ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \
+ __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM_CC_CX)
#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
- __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
+ __CLOBBERS_MEM, vcon (val))
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM_CC_CX, vcon (val))
#endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index eaec6c364e42..cd71273ec49d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -11,6 +11,7 @@
* Executability : eXecutable, NoteXecutable
* Read/Write : ReadOnly, ReadWrite
* Presence : NotPresent
+ * Encryption : Encrypted, Decrypted
*
* Within a category, the attributes are mutually exclusive.
*
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
_TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \
_TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \
- _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+ _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_FSCHECK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index c7797307fc2b..79a4ca6a9606 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -15,4 +15,18 @@
#include <asm-generic/tlb.h>
+/*
+ * While x86 architecture in general requires an IPI to perform TLB
+ * shootdown, enablement code for several hypervisors overrides
+ * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
+ * a hypercall. To keep software pagetable walkers safe in this case we
+ * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
+ * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * for more details.
+ */
+static inline void __tlb_remove_table(void *table)
+{
+ free_page_and_swap_cache(table);
+}
+
#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 50ea3482e1d1..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+ u64 new_tlb_gen;
+
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ smp_mb__before_atomic();
+ new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+ smp_mb__after_atomic();
+
+ return new_tlb_gen;
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+struct tlb_context {
+ u64 ctx_id;
+ u64 tlb_gen;
+};
+
struct tlb_state {
/*
* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -73,13 +101,35 @@ struct tlb_state {
* mode even if we've already switched back to swapper_pg_dir.
*/
struct mm_struct *loaded_mm;
- int state;
+ u16 loaded_mm_asid;
+ u16 next_asid;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
unsigned long cr4;
+
+ /*
+ * This is a list of all contexts that might exist in the TLB.
+ * There is one per ASID that we use, and the ASID (what the
+ * CPU calls PCID) is the index into ctxts.
+ *
+ * For each context, ctx_id indicates which mm the TLB's user
+ * entries came from. As an invariant, the TLB will never
+ * contain entries that are out-of-date as when that mm reached
+ * the tlb_gen in the list.
+ *
+ * To be clear, this means that it's legal for the TLB code to
+ * flush the TLB without updating tlb_gen. This can happen
+ * (for now, at least) due to paravirt remote flushes.
+ *
+ * NB: context 0 is a bit special, since it's also used by
+ * various bits of init code. This is fine -- code that
+ * isn't aware of PCID will end up harmlessly flushing
+ * context 0.
+ */
+ struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
@@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void)
__flush_tlb_global();
else
__flush_tlb();
+
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+ * we might fail to flush kernel translations for other cached ASIDs.
+ *
+ * To avoid this issue, we force PCID off if PGE is off.
+ */
}
static inline void __flush_tlb_one(unsigned long addr)
@@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr)
* and page-granular flushes are available only on i486 and up.
*/
struct flush_tlb_info {
- struct mm_struct *mm;
- unsigned long start;
- unsigned long end;
+ /*
+ * We support several kinds of flushes.
+ *
+ * - Fully flush a single mm. .mm will be set, .end will be
+ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+ * which the IPI sender is trying to catch us up.
+ *
+ * - Partially flush a single mm. .mm will be set, .start and
+ * .end will indicate the range, and .new_tlb_gen will be set
+ * such that the changes between generation .new_tlb_gen-1 and
+ * .new_tlb_gen are entirely contained in the indicated range.
+ *
+ * - Fully flush all mms whose tlb_gens have been updated. .mm
+ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+ * will be zero.
+ */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+ u64 new_tlb_gen;
};
#define local_flush_tlb() __flush_tlb()
@@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
-#define TLBSTATE_OK 1
-#define TLBSTATE_LAZY 2
-
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
+ inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
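The tlb_gen/ASID bookkeeping above boils down to a generation counter per mm and a cached (ctx_id, tlb_gen) pair per ASID; a flush is needed only when the cached pair is behind. The sketch below shows that comparison in plain C11; the struct names mirror the diff but are simplified stand-ins.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct mm {
    uint64_t ctx_id;
    _Atomic uint64_t tlb_gen;
};

struct tlb_ctx {
    uint64_t ctx_id;
    uint64_t tlb_gen;           /* generation this ASID is caught up to */
};

static uint64_t inc_mm_tlb_gen(struct mm *mm)
{
    /* Bump the generation; callers flush up to the value returned. */
    return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
}

static bool asid_needs_flush(struct tlb_ctx *ctx, struct mm *mm)
{
    return ctx->ctx_id != mm->ctx_id ||
           ctx->tlb_gen < atomic_load(&mm->tlb_gen);
}

static void asid_note_flush(struct tlb_ctx *ctx, struct mm *mm,
                            uint64_t new_tlb_gen)
{
    ctx->ctx_id  = mm->ctx_id;
    ctx->tlb_gen = new_tlb_gen;
}

int main(void)
{
    struct mm mm = { .ctx_id = 1, .tlb_gen = 1 };
    struct tlb_ctx asid = { 0, 0 };

    uint64_t gen = inc_mm_tlb_gen(&mm);     /* e.g. after an munmap()   */
    if (asid_needs_flush(&asid, &mm))       /* cached ASID is behind... */
        asid_note_flush(&asid, &mm, gen);   /* ...flush, then catch up  */
    return 0;
}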
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 6358a85e2270..c1d2a9892352 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
extern void setup_node_to_cpumask_map(void);
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
-
#define pcibus_to_node(bus) __pcibus_to_node(bus)
extern int __node_distance(int, int);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 30269dafec47..184eb9894dae 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@
#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
-#define set_fs(x) (current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+}
#define segment_eq(a, b) ((a).seg == (b).seg)
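A rough sketch of the TIF_FSCHECK idea introduced here: changing the address limit marks the task, and the return-to-user path verifies the limit is back to USER_DS. The task struct, the constants, and the assert() are stand-ins for the kernel's thread-flag and exit-path machinery, assumed purely for illustration.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct task {
    uintptr_t addr_limit;
    bool fscheck_pending;
};

#define USER_DS   ((uintptr_t)0x7ffffffff000ULL)
#define KERNEL_DS ((uintptr_t)-1)

static void set_fs(struct task *t, uintptr_t limit)
{
    t->addr_limit = limit;
    t->fscheck_pending = true;          /* re-check on return to user mode */
}

static void exit_to_usermode(struct task *t)
{
    if (t->fscheck_pending) {
        assert(t->addr_limit == USER_DS);   /* kernel would warn/fix here */
        t->fscheck_pending = false;
    }
}

int main(void)
{
    struct task t = { .addr_limit = USER_DS };

    set_fs(&t, KERNEL_DS);              /* temporarily widen the limit    */
    set_fs(&t, USER_DS);                /* ...and restore it before exit  */
    exit_to_usermode(&t);               /* passes; would trip if forgotten */
    return 0;
}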
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e6676495b125..e9f793e2df7a 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,11 +12,14 @@ struct unwind_state {
struct task_struct *task;
int graph_idx;
bool error;
-#ifdef CONFIG_FRAME_POINTER
+#if defined(CONFIG_ORC_UNWINDER)
+ bool signal, full_regs;
+ unsigned long sp, bp, ip;
+ struct pt_regs *regs;
+#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
bool got_irq;
- unsigned long *bp, *orig_sp;
+ unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
- unsigned long ip;
#else
unsigned long *sp;
#endif
@@ -24,41 +27,30 @@ struct unwind_state {
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame);
-
bool unwind_next_frame(struct unwind_state *state);
-
unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
static inline bool unwind_done(struct unwind_state *state)
{
return state->stack_info.type == STACK_TYPE_UNKNOWN;
}
-static inline
-void unwind_start(struct unwind_state *state, struct task_struct *task,
- struct pt_regs *regs, unsigned long *first_frame)
-{
- first_frame = first_frame ? : get_stack_pointer(task, regs);
-
- __unwind_start(state, task, regs, first_frame);
-}
-
static inline bool unwind_error(struct unwind_state *state)
{
return state->error;
}
-#ifdef CONFIG_FRAME_POINTER
-
static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
{
- if (unwind_done(state))
- return NULL;
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
- return state->regs ? &state->regs->ip : state->bp + 1;
+ __unwind_start(state, task, regs, first_frame);
}
+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
return state->regs;
}
-
-#else /* !CONFIG_FRAME_POINTER */
-
-static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
return NULL;
}
+#endif
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+#ifdef CONFIG_ORC_UNWINDER
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size) {}
+#endif
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
{
- return NULL;
+#ifdef CONFIG_SMP
+ return task != current && task->on_cpu;
+#else
+ return false;
+#endif
}
-#endif /* CONFIG_FRAME_POINTER */
-
#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..bae46fc6b9de
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest. The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack. Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
+ * category. In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer. Such
+ * code needs to be annotated such that objtool can understand it. The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction. Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary. It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+ .pushsection .discard.unwind_hints
+ /* struct unwind_hint */
+ .long .Lunwind_hint_ip_\@ - .
+ .short \sp_offset
+ .byte \sp_reg
+ .byte \type
+ .popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+ .if \base == %rsp
+ .if \indirect
+ .set sp_reg, ORC_REG_SP_INDIRECT
+ .else
+ .set sp_reg, ORC_REG_SP
+ .endif
+ .elseif \base == %rbp
+ .set sp_reg, ORC_REG_BP
+ .elseif \base == %rdi
+ .set sp_reg, ORC_REG_DI
+ .elseif \base == %rdx
+ .set sp_reg, ORC_REG_DX
+ .elseif \base == %r10
+ .set sp_reg, ORC_REG_R10
+ .else
+ .error "UNWIND_HINT_REGS: bad base register"
+ .endif
+
+ .set sp_offset, \offset
+
+ .if \iret
+ .set type, ORC_TYPE_REGS_IRET
+ .elseif \extra == 0
+ .set type, ORC_TYPE_REGS_IRET
+ .set sp_offset, \offset + (16*8)
+ .else
+ .set type, ORC_TYPE_REGS
+ .endif
+
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+ UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+ UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type) \
+ "987: \n\t" \
+ ".pushsection .discard.unwind_hints\n\t" \
+ /* struct unwind_hint */ \
+ ".long 987b - .\n\t" \
+ ".short " __stringify(sp_offset) "\n\t" \
+ ".byte " __stringify(sp_reg) "\n\t" \
+ ".byte " __stringify(type) "\n\t" \
+ ".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
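Reading the UNWIND_HINT macros above, the record emitted into .discard.unwind_hints is a 4+2+1+1 byte entry. The struct below is inferred from the .long/.short/.byte directives to make the layout concrete; it is not copied from a kernel header.

#include <stdint.h>

struct unwind_hint_record {
    int32_t  ip;          /* .long: hint address, relative to the record */
    int16_t  sp_offset;   /* .short: offset from sp_reg to the frame     */
    uint8_t  sp_reg;      /* .byte: ORC_REG_* base register              */
    uint8_t  type;        /* .byte: ORC_TYPE_* / UNWIND_HINT_TYPE_*      */
} __attribute__((packed));

/* Objtool reads these records at build time, then follows pushes and pops
 * from each hinted instruction to fill in ORC data for the unwinder. */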
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..9f42beefc67a 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -7,12 +7,24 @@
#ifndef _ASM_X86_VGA_H
#define _ASM_X86_VGA_H
+#include <asm/set_memory.h>
+
/*
* On the PC, we can just recalculate addresses and then
* access the videoram directly without any black magic.
+ * To support memory encryption however, we need to access
+ * the videoram as decrypted memory.
*/
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s) \
+({ \
+ unsigned long start = (unsigned long)phys_to_virt(x); \
+ \
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \
+ set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
+ \
+ start; \
+})
#define vga_readb(x) (*(x))
#define vga_writeb(x, y) (*(y) = (x))
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
*
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a01892bdd61a..287eac7d207f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-ifdef CONFIG_FRAME_POINTER
-obj-y += unwind_frame.o
-else
-obj-y += unwind_guess.o
-endif
+obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
+obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
+obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7491e73d9253..97bb2caf3428 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
#define ACPI_INVALID_GSI INT_MIN
/*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
* with sanity checks for phys == 0 and size == 0.
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
if (!phys || !size)
return NULL;
- return early_ioremap(phys, size);
+ return early_memremap(phys, size);
}
void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
if (!map || !size)
return;
- early_iounmap(map, size);
+ early_memunmap(map, size);
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32e14d137416..3344d3382e91 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
int poke_int3_handler(struct pt_regs *regs)
{
- /* bp_patching_in_progress */
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * bp_patching_in_progress.
+ *
+ * in_progress = TRUE INT3
+ * WMB RMB
+ * write INT3 if (in_progress)
+ *
+ * Idem for bp_int3_handler.
+ */
smp_rmb();
if (likely(!bp_patching_in_progress))
@@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
/*
- * Corresponding read barrier in int3 notifier for
- * making sure the in_progress flags is correctly ordered wrt.
- * patching
+ * Corresponding read barrier in int3 notifier for making sure the
+ * in_progress and handler are correctly ordered wrt. patching.
*/
smp_wmb();
@@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
text_poke(addr, opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
-
+ /*
+ * sync_core() implies an smp_mb() and orders this store against
+ * the writing of the new instruction.
+ */
bp_patching_in_progress = false;
- smp_wmb();
return addr;
}
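The ordering argument spelled out in the poke_int3_handler() comments can be restated with C11 release/acquire operations, which play the roles of the smp_wmb()/smp_rmb() pair: the handler and in_progress are published before the INT3 byte, and the trap path only inspects them after having executed the INT3. This is a simplified model, not the actual text-poking code.

#include <stdatomic.h>
#include <stdbool.h>

static void *bp_int3_handler;
static atomic_bool bp_patching_in_progress;

static void patcher_arm(void *handler)
{
    bp_int3_handler = handler;                       /* publish handler first */
    atomic_store_explicit(&bp_patching_in_progress, true,
                          memory_order_release);     /* plays the smp_wmb() role */
    /* ...then write the INT3 opcode into the instruction stream... */
}

static bool trap_side(void)
{
    /* Reached only after executing the INT3. */
    if (!atomic_load_explicit(&bp_patching_in_progress,
                              memory_order_acquire)) /* plays the smp_rmb() role */
        return false;                                /* not our breakpoint */
    /* ...otherwise divert execution to bp_int3_handler... */
    return true;
}

int main(void)
{
    patcher_arm((void *)0x1);
    return trap_side() ? 0 : 1;
}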
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
#include <asm/ucontext.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
OFFSET(stack_canary_offset, stack_canary, canary);
#endif
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3b9e220621f8..9862e2cd6d93 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -297,13 +297,29 @@ static int nearby_node(int apicid)
}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+ u32 cus_per_node;
+
+ if (c->x86 >= 0x17)
+ return;
+
+ cus_per_node = c->x86_max_cores / nodes_per_socket;
+ c->cpu_core_id %= cus_per_node;
+}
+
/*
* Fixup core topology information for
* (1) AMD multi-node processors
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u8 node_id;
@@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
} else
return;
- /* fixup multi-node processor information */
if (nodes_per_socket > 1) {
- u32 cus_per_node;
-
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cus_per_node = c->x86_max_cores / nodes_per_socket;
-
- /* core id has to be in the [0 .. cores_per_node - 1] range */
- c->cpu_core_id %= cus_per_node;
+ legacy_fixup_core_id(c);
}
}
#endif
@@ -548,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
static void early_init_amd(struct cpuinfo_x86 *c)
{
+ u32 dummy;
+
early_init_amd_mc(c);
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
@@ -612,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
*/
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_E400);
+
+ /*
+ * BIOS support is required for SME. If BIOS has enabled SME then
+ * adjust x86_phys_bits by the SME physical address space reduction
+ * value. If BIOS has not enabled SME then don't advertise the
+ * feature (set in scattered.c). Also, since the SME support requires
+ * long mode, don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME)) {
+ u64 msr;
+
+ /* Check if SME is enabled */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+ if (IS_ENABLED(CONFIG_X86_32))
+ clear_cpu_cap(c, X86_FEATURE_SME);
+ } else {
+ clear_cpu_cap(c, X86_FEATURE_SME);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -730,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
static void init_amd(struct cpuinfo_x86 *c)
{
- u32 dummy;
-
early_init_amd(c);
/*
@@ -793,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
/* 3DNow or LM implies PREFETCHW */
if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
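A worked example of the SME physical-address-bit adjustment in early_init_amd(): CPUID leaf 0x8000001f reports the reduction in EBX bits 11:6 (and the C-bit position in bits 5:0). The EBX value and the starting 48 physical bits below are made up for illustration (a part with the C-bit at 47 and a 1-bit reduction).

#include <stdio.h>

int main(void)
{
    unsigned int cpuid_8000001f_ebx = 0x0000006f; /* example: C-bit 47, reduction 1 */
    unsigned int phys_bits = 48;                  /* example c->x86_phys_bits */

    unsigned int reduction = (cpuid_8000001f_ebx >> 6) & 0x3f;
    phys_bits -= reduction;

    printf("reduction=%u -> x86_phys_bits=%u\n", reduction, phys_bits);
    return 0;
}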
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index d869c8671e36..0ee83321a313 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -8,20 +8,25 @@
* This file is licensed under GPLv2.
*/
-#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/smp.h>
struct aperfmperf_sample {
unsigned int khz;
- unsigned long jiffies;
+ ktime_t time;
u64 aperf;
u64 mperf;
};
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+#define APERFMPERF_CACHE_THRESHOLD_MS 10
+#define APERFMPERF_REFRESH_DELAY_MS 20
+#define APERFMPERF_STALE_THRESHOLD_MS 1000
+
/*
* aperfmperf_snapshot_khz()
* On the current CPU, snapshot APERF, MPERF, and jiffies
@@ -33,13 +38,18 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+ ktime_t now = ktime_get();
+ s64 time_delta = ktime_ms_delta(now, s->time);
+ unsigned long flags;
- /* Don't bother re-computing within 10 ms */
- if (time_before(jiffies, s->jiffies + HZ/100))
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return;
+ local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
+ local_irq_restore(flags);
aperf_delta = aperf - s->aperf;
mperf_delta = mperf - s->mperf;
@@ -51,22 +61,21 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
- /*
- * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
- * khz = (cpu_khz * aperf_delta) / mperf_delta
- */
- if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
- s->khz = div64_u64(aperf_delta,
- div64_u64(mperf_delta, cpu_khz));
- s->jiffies = jiffies;
+ s->time = now;
s->aperf = aperf;
s->mperf = mperf;
+
+ /* If the previous iteration was too long ago, discard it. */
+ if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
+ s->khz = 0;
+ else
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ unsigned int khz;
+
if (!cpu_khz)
return 0;
@@ -74,6 +83,12 @@ unsigned int arch_freq_get_on_cpu(int cpu)
return 0;
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ khz = per_cpu(samples.khz, cpu);
+ if (khz)
+ return khz;
+
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
return per_cpu(samples.khz, cpu);
}
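The frequency estimate used by the new aperfmperf code reduces to scaling cpu_khz by the APERF/MPERF delta ratio and discarding samples older than the staleness threshold. The sketch below shows that computation with made-up numbers; it leaves out the IRQ-off MSR reads and the per-CPU caching.

#include <stdint.h>
#include <stdio.h>

#define STALE_THRESHOLD_MS 1000

static unsigned int effective_khz(uint64_t cpu_khz,
                                  uint64_t aperf_delta, uint64_t mperf_delta,
                                  int64_t age_ms)
{
    if (mperf_delta == 0 || age_ms > STALE_THRESHOLD_MS)
        return 0;                       /* no usable sample */
    return (unsigned int)(cpu_khz * aperf_delta / mperf_delta);
}

int main(void)
{
    /* Base 2400000 kHz; the core ran at ~75% of the reference clock rate. */
    printf("%u kHz\n", effective_khz(2400000, 3000000, 4000000, 20));
    return 0;
}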
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0af86d9242da..db684880d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,6 +21,14 @@
void __init check_bugs(void)
{
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
identify_boot_cpu();
if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..b95cd94ca97b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
static int __init x86_noinvpcid_setup(char *s)
{
/* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE)) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
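The setup_pcid() policy is a two-input decision: enable CR4.PCIDE only if both PCID and PGE are present, since flush_tlb_all() depends on the CR4.PGE toggle. A toy version of that decision, with the feature bits reduced to booleans, is sketched below.

#include <stdbool.h>
#include <stdio.h>

static bool decide_pcide(bool has_pcid, bool has_pge)
{
    if (!has_pcid)
        return false;       /* nothing to enable */
    if (!has_pge) {
        /* Odd (likely misconfigured VM) combination: refuse PCID rather
         * than risk incomplete TLB flushes. */
        return false;
    }
    return true;            /* set CR4.PCIDE */
}

int main(void)
{
    printf("PCID+PGE -> %d, PCID only -> %d\n",
           decide_pcide(true, true), decide_pcide(true, false));
    return 0;
}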
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c55fb2cb2acc..24f749324c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct cacheinfo *this_leaf;
int i, sibling;
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+ this_leaf = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
this_leaf = this_cpu_ci->info_list + index;
@@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
&this_leaf->shared_cpu_map);
}
}
- } else if (index == 3) {
- for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- this_cpu_ci = get_cpu_cacheinfo(i);
- if (!this_cpu_ci->info_list)
- continue;
- this_leaf = this_cpu_ci->info_list + index;
- for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
- if (!cpu_online(sibling))
- continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
- }
- }
} else
return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..3b413065c613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,6 +51,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/set_memory.h>
#include "mce-internal.h"
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
return ret;
}
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+ unsigned long decoy_addr;
+
+ /*
+ * Unmap this page from the kernel 1:1 mappings to make sure
+ * we don't log more errors because of speculative access to
+ * the page.
+ * We would like to just call:
+ * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the posion page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_np() not checking whether we passed
+ * a legal address.
+ */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+ if (set_memory_np(decoy_addr, 1))
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
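The decoy-address trick described in arch_unmap_kpfn() is just an XOR of bit 63 into the direct-map base, producing a non-canonical alias that cannot be speculatively dereferenced while still naming the same pfn for set_memory_np(). The PAGE_OFFSET and pfn values below are examples only.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_offset = 0xffff880000000000ULL;  /* example direct-map base */
    uint64_t pfn = 0x12345;
    uint64_t page_shift = 12;

    uint64_t real  = (pfn << page_shift) + page_offset;
    uint64_t decoy = (pfn << page_shift) + (page_offset ^ (1ULL << 63));

    printf("real  %#llx\ndecoy %#llx\n",
           (unsigned long long)real, (unsigned long long)decoy);
    return 0;
}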
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9e314bcf67cc..5ce1a5689162 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
- /* Collect bank_info using CPU 0 for now. */
- if (cpu)
+ /* Return early if this bank was already initialized. */
+ if (smca_banks[bank].hwid)
return;
if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
@@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (hwid_mcatype == s_hwid->hwid_mcatype) {
-
- WARN(smca_banks[bank].hwid,
- "Bank %s already initialized!\n",
- smca_get_name(s_hwid->bank_type));
-
smca_banks[bank].hwid = s_hwid;
smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d7cc190ae457..f7370abd33c6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = {
NULL
};
-static struct attribute_group thermal_attr_group = {
+static const struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 21b185793c80..c6daec4bdba5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch)
list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id)
+ if (p->patch_id >= new_patch->patch_id) {
/* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
return;
+ }
list_replace(&p->plist, &new_patch->plist);
kfree(p->data);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9cb98ee103db..86e8f0b2537b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = {
NULL
};
-static struct attribute_group mc_attr_group = {
+static const struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
@@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = {
NULL
};
-static struct attribute_group cpu_root_microcode_group = {
+static const struct attribute_group cpu_root_microcode_group = {
.name = "microcode",
.attrs = cpu_root_microcode_attrs,
};
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 59edbe9d4ccb..8f7a9bbad514 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header,
return false;
}
-static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
if (!p)
- return ERR_PTR(-ENOMEM);
+ return NULL;
p->data = kmemdup(data, size, GFP_KERNEL);
if (!p->data) {
kfree(p);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
return p;
@@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size)
if (mc_hdr->rev <= mc_saved_hdr->rev)
continue;
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer %p\n", data);
else
list_replace(&iter->plist, &p->plist);
@@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size)
* newly found.
*/
if (!prev_found) {
- p = __alloc_microcode_buf(data, size);
- if (IS_ERR(p))
+ p = memdup_patch(data, size);
+ if (!p)
pr_err("Error allocating buffer for %p\n", data);
else
list_add_tail(&p->plist, &microcode_cache);
}
+ if (!p)
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
* paging has been enabled.
*/
- if (p) {
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
- else
- intel_ucode_patch = p->data;
- }
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
}
static int microcode_sanity_check(void *mc, int print_err)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c5bb63be4ba1..40d5a8a75212 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
@@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
- set_mtrr(i, base, size, type);
+ set_mtrr_cpuslocked(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
@@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
- set_mtrr(replace, 0, 0, 0);
+ set_mtrr_cpuslocked(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
@@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
goto out;
}
if (--mtrr_usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
+ set_mtrr_cpuslocked(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 23c23508c012..05459ad3db46 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..f13b4c00a5de 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+ __show_regs(regs, 0);
+
/*
* Scan the stack, printing any text addresses we find. At the
* same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
*/
- if (regs && stack == &regs->ip) {
- unwind_next_frame(&state);
- continue;
- }
+ if (regs && stack == &regs->ip)
+ goto next;
if (stack == ret_addr_p)
reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (!reliable)
continue;
+next:
/*
* Get the next frame from the unwinder. No need to
* check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
- if (regs)
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
}
@@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_X86_32
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
} else {
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
begin = end - (exception_stack_sizes[k] / sizeof(long));
regs = (struct pt_regs *)end - 1;
- if (stack < begin || stack >= end)
+ if (stack <= begin || stack >= end)
continue;
info->type = STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 532da61d605c..71c11ad5643e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
* Note: this function only works correctly once the E820 table is sorted and
* not-overlapping (at least for the range specified), which is the case normally.
*/
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+ enum e820_type type)
{
int i;
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
* coverage of the desired range exists:
*/
if (start >= end)
- return 1;
+ return entry;
}
- return 0;
+
+ return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+ return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+ struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+ return entry ? entry->type : -EINVAL;
}
/*
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..a4516ca4c4f3 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
};
static void __init
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6b91e2eb8d3f..9c4e7ba6870c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
- pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..6a193b93fd95 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -14,6 +14,7 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -33,7 +34,6 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,14 +45,17 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
return ptr - (void *)_text + (void *)physaddr;
}
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+ struct boot_params *bp)
{
unsigned long load_delta, *p;
+ unsigned long pgtable_flags;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;
+ unsigned int *next_pgt_ptr;
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -68,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
+ /* Activate Secure Memory Encryption (SME) if supported and enabled */
+ sme_enable(bp);
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,30 +101,34 @@ void __head __startup_64(unsigned long physaddr)
* it avoids problems around wraparound.
*/
- pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+ pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
- p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+ p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}
i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
- pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
- pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+ pud[i + 1] = (pudval_t)pmd + pgtable_flags;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -136,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
pmd[i] += load_delta;
}
- /* Fixup phys_base */
+ /*
+ * Fixup phys_base - remove the memory encryption mask to obtain
+ * the true physical address.
+ */
p = fixup_pointer(&phys_base, physaddr);
- *p += load_delta;
+ *p += load_delta - sme_get_me_mask();
+
+ /* Encrypt the kernel (if SME is active) */
+ sme_encrypt_kernel();
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
}
/* Wipe all early page tables except for the kernel symbol map */
@@ -146,17 +180,17 @@ static void __init reset_early_page_tables(void)
{
memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_top_pgt));
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
- pmdval_t pmd, *pmd_p;
+ pmdval_t *pmd_p;
/* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -215,12 +249,21 @@ again:
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
- pmd = (physaddr & PMD_MASK) + early_pmd_flags;
pmd_p[pmd_index(address)] = pmd;
return 0;
}
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
@@ -243,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
char * command_line;
unsigned long cmd_line_ptr;
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
+
memcpy(&boot_params, real_mode_data, sizeof boot_params);
sanitize_boot_params(&boot_params);
cmd_line_ptr = get_cmd_line_ptr();
@@ -250,6 +299,14 @@ static void __init copy_bootdata(char *real_mode_data)
command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
}
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
@@ -279,6 +336,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_page(init_top_pgt);
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
+
kasan_early_init();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
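The assembly caller in head_64.S (further down in this patch) consumes the new return value directly; as a rough C analogue only (the variable names below follow the assembly's arguments and are not literal code from the patch), the CR3 value it forms amounts to:

        /* Sketch: __startup_64() now returns the SME mask (0 when SME is
         * inactive), and that mask is folded into the page-table address
         * before it is loaded into CR3. */
        unsigned long mask = __startup_64(physaddr, bp);
        unsigned long cr3  = __pa_nodebug(early_top_pgt) + mask;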
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..29da9599fec0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
subarch_entries:
.long .Ldefault_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
@@ -457,12 +455,9 @@ early_idt_handler_common:
/* The vector number is in pt_regs->gs */
cld
- pushl %fs /* pt_regs->fs */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %es /* pt_regs->es */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
- pushl %ds /* pt_regs->ds */
- movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
pushl %eax /* pt_regs->ax */
pushl %ebp /* pt_regs->bp */
pushl %edi /* pt_regs->di */
@@ -479,9 +474,8 @@ early_idt_handler_common:
/* Load the vector number into EDX */
movl PT_GS(%esp), %edx
- /* Load GS into pt_regs->gs and clear high bits */
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
movw %gs, PT_GS(%esp)
- movw $0, PT_GS+2(%esp)
movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
call early_fixup_exception
@@ -493,10 +487,10 @@ early_idt_handler_common:
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
popl %eax /* pt_regs->ax */
- popl %ds /* pt_regs->ds */
- popl %es /* pt_regs->es */
- popl %fs /* pt_regs->fs */
- popl %gs /* pt_regs->gs */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
decl %ss:early_recursion_flag
addl $4, %esp /* pop pt_regs->orig_ax */
iret
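The dropped "movw $0" instructions only mattered if the high halves of the 32-bit segment slots were treated as meaningful; after this change they are documented as model-dependent garbage. A sketch of the pt_regs layout assumption behind the comments above (illustrative only, the real definitions live in asm/ptrace.h):

        struct pt_regs_seg_slot {              /* illustrative only */
                unsigned short seg;            /* fs/es/ds/gs: valid selector */
                unsigned short __high;         /* __fsh/__esh/__dsh/__gsh:    */
        };                                     /* contents now undefined      */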
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..513cbb012ecc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,12 +73,19 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
+ /*
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
+ */
leaq _text(%rip), %rdi
pushq %rsi
call __startup_64
popq %rsi
- movq $(early_top_pgt - __START_KERNEL_map), %rax
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_top_pgt - __START_KERNEL_map), %rax
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+ pushq %rsi
+ call __startup_secondary_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
1:
/* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
NEXT_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
- .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
NEXT_PAGE(init_top_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.fill 511, 8, 0
NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
#ifdef CONFIG_X86_5LEVEL
NEXT_PAGE(level4_kernel_pgt)
.fill 511,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
NEXT_PAGE(level2_kernel_pgt)
/*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
NEXT_PAGE(level2_fixmap_pgt)
.fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
.fill 5,8,0
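The *_NOENC flag variants used throughout these static tables are assumed to differ from the normal ones only in omitting the SME encryption bit, which is added at runtime once the mask is known. The assumed relationship (the definitions live in pgtable_types.h, not in this hunk):

        #define _PAGE_TABLE     (_PAGE_TABLE_NOENC   | _PAGE_ENC)
        #define _KERNPG_TABLE   (_KERNPG_TABLE_NOENC | _PAGE_ENC)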
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 16f82a3aaec7..8ce4212e2b8d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer)
return 0;
}
-static int hpet_resume(struct clock_event_device *evt, int timer)
-{
- if (!timer) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
- irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_hardirq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
+static int hpet_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
hpet_print_config();
-
return 0;
}
@@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt)
static int hpet_legacy_resume(struct clock_event_device *evt)
{
- return hpet_resume(evt, 0);
+ return hpet_resume(evt);
}
static int hpet_legacy_next_event(unsigned long delta,
@@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt)
static int hpet_msi_resume(struct clock_event_device *evt)
{
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ struct irq_data *data = irq_get_irq_data(hdev->irq);
+ struct msi_msg msg;
- return hpet_resume(evt, hdev->num);
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hdev, &msg);
+ hpet_msi_unmask(data);
+ return 0;
}
static int hpet_msi_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 38b64587b31b..fd6f8fbbe6f2 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
struct setup_data_node *node = file->private_data;
unsigned long remain;
loff_t pos = *ppos;
- struct page *pg;
void *p;
u64 pa;
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
count = node->len - pos;
pa = node->paddr + sizeof(struct setup_data) + pos;
- pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- p = ioremap_cache(pa, count);
- if (!p)
- return -ENXIO;
- } else
- p = __va(pa);
+ p = memremap(pa, count, MEMREMAP_WB);
+ if (!p)
+ return -ENOMEM;
remain = copy_to_user(user_buf, p, count);
- if (PageHighMem(pg))
- iounmap(p);
+ memunmap(p);
if (remain)
return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
struct setup_data *data;
int error;
struct dentry *d;
- struct page *pg;
u64 pa_data;
int no = 0;
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
goto err_dir;
}
- pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- data = ioremap_cache(pa_data, sizeof(*data));
- if (!data) {
- kfree(node);
- error = -ENXIO;
- goto err_dir;
- }
- } else
- data = __va(pa_data);
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
+ goto err_dir;
+ }
node->paddr = pa_data;
node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
error = create_setup_data_node(d, no, node);
pa_data = data->next;
- if (PageHighMem(pg))
- iounmap(data);
+ memunmap(data);
if (error)
goto err_dir;
no++;
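All of the ioremap_cache()/PageHighMem() call sites replaced here (and in ksysfs.c below) follow one pattern; a minimal sketch of it, with a hypothetical helper name:

        /* Hypothetical helper: read one setup_data header via a WB mapping. */
        static int read_setup_data_header(u64 pa, struct setup_data *out)
        {
                struct setup_data *data;

                data = memremap(pa, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;       /* was -ENXIO in the ioremap path */

                *out = *data;
                memunmap(data);               /* unconditional: no highmem test */
                return 0;
        }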
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
+#include <asm/sections.h>
#include "common.h"
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
/*
* Do not optimize in the entry code due to the unstable
- * stack handling.
+ * stack handling and register setup.
*/
- if ((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end))
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4afc67f5facc..4b0592ca9e47 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/io.h>
-#include <asm/io.h>
#include <asm/setup.h>
static ssize_t version_show(struct kobject *kobj,
@@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = {
NULL,
};
-static struct attribute_group boot_params_attr_group = {
+static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
.bin_attrs = boot_params_data_attrs,
};
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
*paddr = pa_data;
return 0;
}
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
u64 pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
if (nr == i) {
*size = data->len;
- iounmap(data);
+ memunmap(data);
return 0;
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
ret = sprintf(buf, "0x%x\n", data->type);
- iounmap(data);
+ memunmap(data);
return ret;
}
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
if (!data)
return -ENOMEM;
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
goto out;
ret = count;
- p = ioremap_cache(paddr + sizeof(*data), data->len);
+ p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
if (!p) {
ret = -ENOMEM;
goto out;
}
memcpy(buf, p + off, count);
- iounmap(p);
+ memunmap(p);
out:
- iounmap(data);
+ memunmap(data);
return ret;
}
@@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = {
NULL,
};
-static struct attribute_group setup_data_attr_group = {
+static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
.bin_attrs = setup_data_data_attrs,
};
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
*nr = 0;
while (pa_data) {
*nr += 1;
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
if (!data) {
ret = -ENOMEM;
goto out;
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
}
out:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 71c17a5be983..d04e30e3c0ff 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token)
if (hlist_unhashed(&n.link))
break;
+ rcu_irq_exit();
+
if (!n.halted) {
local_irq_enable();
schedule();
@@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token)
/*
* We cannot reschedule. So halt.
*/
- rcu_irq_exit();
native_safe_halt();
local_irq_disable();
- rcu_irq_enter();
}
+
+ rcu_irq_enter();
}
if (!n.halted)
finish_swait(&n.wq, &wait);
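For readability, the loop that results from this change (a condensed sketch; statements elided between the two hunks are omitted):

        for (;;) {
                if (hlist_unhashed(&n.link))
                        break;

                rcu_irq_exit();                 /* now taken on both paths */

                if (!n.halted) {
                        local_irq_enable();
                        schedule();             /* may sleep */
                        local_irq_disable();
                } else {
                        /* We cannot reschedule. So halt. */
                        native_safe_halt();
                        local_irq_disable();
                }

                rcu_irq_enter();
        }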
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *__mm)
{
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+ refresh_ldt_segments();
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cb0a30473c23..1f790cf9d38f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
return 0;
err:
free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.alloc_pgt_page = alloc_pgt_page,
.context = image,
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
image->start = relocate_kernel((unsigned long)image->head,
(unsigned long)page_list,
image->start,
- image->preserve_context);
+ image->preserve_context,
+ sme_active());
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
{
kexec_mark_crashkres(false);
}
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ /*
+ * If SME is active we need to be sure that kexec pages are
+ * not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ /*
+ * If SME is active we need to reset the pages back to being
+ * an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
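A hedged sketch of how the two new hooks are expected to pair up around a kexec page allocation (the actual call sites are in generic kexec code, not in this hunk; the wrapper below is hypothetical):

        static void *kexec_alloc_decrypted_page(void)   /* illustrative only */
        {
                struct page *page = alloc_page(GFP_KERNEL);
                void *vaddr;

                if (!page)
                        return NULL;

                vaddr = page_address(page);
                if (arch_kexec_post_alloc_pages(vaddr, 1, GFP_KERNEL)) {
                        __free_page(page);
                        return NULL;            /* mapping stayed encrypted */
                }
                return vaddr;                   /* now mapped decrypted */
        }

        /* ...and before freeing:
         * arch_kexec_pre_free_pages(vaddr, 1);
         * __free_page(virt_to_page(vaddr));
         */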
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f67bd3205df7..62e7d70aadd5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
+#include <asm/unwind.h>
#if 0
#define DEBUGP(fmt, ...) \
@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
}
if (alt) {
@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
/* make jump label nops */
jump_label_apply_nops(me);
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
return 0;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0d904d759ff1..5cbb3177ed17 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
}
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
static unsigned long __init get_mpc_size(unsigned long physptr)
{
struct mpc_table *mpc;
unsigned long size;
- mpc = early_ioremap(physptr, PAGE_SIZE);
+ mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
- early_iounmap(mpc, PAGE_SIZE);
+ early_memunmap(mpc, PAGE_SIZE);
apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
unsigned long size;
size = get_mpc_size(mpf->physptr);
- mpc = early_ioremap(mpf->physptr, size);
+ mpc = early_memremap(mpf->physptr, size);
+
/*
* Read the physical hardware table. Anything here will
* override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#endif
pr_err("BIOS bug, MP table errors detected!...\n");
pr_cont("... disabling SMP support. (tell your hw vendor)\n");
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
return -1;
}
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
if (early)
return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
*/
void __init default_get_smp_config(unsigned int early)
{
- struct mpf_intel *mpf = mpf_found;
+ struct mpf_intel *mpf;
if (!smp_found_config)
return;
- if (!mpf)
+ if (!mpf_base)
return;
if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: error mapping MP table\n");
+ return;
+ }
+
pr_info("Intel MultiProcessor Specification v1.%d\n",
mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
/*
* Now see if we need to read further.
*/
- if (mpf->feature1 != 0) {
+ if (mpf->feature1) {
if (early) {
/*
* local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
- if (check_physptr(mpf, early))
+ if (check_physptr(mpf, early)) {
+ early_memunmap(mpf, sizeof(*mpf));
return;
+ }
} else
BUG();
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
/*
* Only use the first configuration found.
*/
+
+ early_memunmap(mpf, sizeof(*mpf));
}
static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
static int __init smp_scan_config(unsigned long base, unsigned long length)
{
- unsigned int *bp = phys_to_virt(base);
+ unsigned int *bp;
struct mpf_intel *mpf;
- unsigned long mem;
+ int ret = 0;
apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
+ bp = early_memremap(base, length);
mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
(mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
- mpf_found = mpf;
+ mpf_base = base;
- pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
- (unsigned long long) virt_to_phys(mpf),
- (unsigned long long) virt_to_phys(mpf) +
- sizeof(*mpf) - 1, mpf);
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+ base, base + sizeof(*mpf) - 1, mpf);
- mem = virt_to_phys(mpf);
- memblock_reserve(mem, sizeof(*mpf));
+ memblock_reserve(base, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
- return 1;
+ ret = 1;
}
- bp += 4;
+ early_memunmap(bp, length);
+
+ if (ret)
+ break;
+
+ base += 16;
length -= 16;
}
- return 0;
+ return ret;
}
void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
char oem[10];
struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
if (!enable_update_mptable)
return 0;
- mpf = mpf_found;
- if (!mpf)
+ if (!mpf_base)
return 0;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
+ return 0;
+ }
+
/*
* Now see if we need to go further.
*/
- if (mpf->feature1 != 0)
- return 0;
+ if (mpf->feature1)
+ goto do_unmap_mpf;
if (!mpf->physptr)
- return 0;
+ goto do_unmap_mpf;
- mpc = phys_to_virt(mpf->physptr);
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
if (!smp_check_mpc(mpc, oem, str))
- return 0;
+ goto do_unmap_mpc;
- pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+ pr_info("mpf: %llx\n", (u64)mpf_base);
pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
pr_info("mpc is readonly, please try alloc_mptable instead\n");
- return 0;
+ goto do_unmap_mpc;
}
pr_info("use in-position replacing\n");
} else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
mpf->physptr = mpc_new_phys;
- mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
mpc = mpc_new;
+ size = mpc_new_length;
/* check if we can modify that */
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
pr_info("mpf new: %x\n", 0x400 - 16);
- mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
mpf = mpf_new;
mpf->physptr = mpc_new_phys;
}
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
*/
replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
return 0;
}
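Every consumer of the MP floating pointer now follows the same map-on-demand discipline shown above; a minimal sketch (function name hypothetical):

        static void __init use_mp_table(void)
        {
                struct mpf_intel *mpf;

                if (!mpf_base)
                        return;

                mpf = early_memremap(mpf_base, sizeof(*mpf));
                if (!mpf)
                        return;

                /* ...read mpf->physptr, mpf->feature1, ... */

                early_memunmap(mpf, sizeof(*mpf));
        }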
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 446c8aa09b9b..35aafc95e4b8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -39,26 +39,26 @@
#include <trace/events/nmi.h>
struct nmi_desc {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct list_head head;
};
static struct nmi_desc nmi_desc[NMI_MAX] =
{
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
.head = LIST_HEAD_INIT(nmi_desc[0].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
.head = LIST_HEAD_INIT(nmi_desc[2].head),
},
{
- .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
.head = LIST_HEAD_INIT(nmi_desc[3].head),
},
@@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
init_irq_work(&action->irq_work, nmi_max_handler);
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
/*
* Indicate if there are multiple registrations on the
@@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
else
list_add_tail_rcu(&action->list, &desc->head);
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);
@@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
struct nmiaction *n;
unsigned long flags;
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
list_for_each_entry_rcu(n, &desc->head, list) {
/*
@@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
}
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5e16d3f29594..0accc2404b92 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -93,9 +93,12 @@ again:
if (gfpflags_allow_blocking(flag)) {
page = dma_alloc_from_contiguous(dev, count, get_order(size),
flag);
- if (page && page_to_phys(page) + size > dma_mask) {
- dma_release_from_contiguous(dev, page, count);
- page = NULL;
+ if (page) {
+ addr = phys_to_dma(dev, page_to_phys(page));
+ if (addr + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
}
}
/* fallback */
@@ -104,7 +107,7 @@ again:
if (!page)
return NULL;
- addr = page_to_phys(page);
+ addr = phys_to_dma(dev, page_to_phys(page));
if (addr + size > dma_mask) {
__free_pages(page, get_order(size));
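The switch from page_to_phys() to phys_to_dma() matters because, with SME active, the DMA address is assumed to carry the encryption bit and can therefore exceed a device's mask even when the bare physical address would not. A condensed sketch of the check being applied:

        /* With SME, phys_to_dma() is assumed to fold in the encryption mask. */
        dma_addr_t addr = phys_to_dma(dev, page_to_phys(page));
        if (addr + size > dma_mask) {
                /* device cannot reach the encrypted alias: release the
                 * allocation and fall back (eventually to swiotlb). */
        }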
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a6d404087fe3..4fc3cb60ea11 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
enum dma_data_direction dir,
unsigned long attrs)
{
- dma_addr_t bus = page_to_phys(page) + offset;
+ dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
return NOMMU_MAPPING_ERROR;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 1e23577e17cf..677077510e30 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,12 +6,14 @@
#include <linux/swiotlb.h>
#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/xen/swiotlb-xen.h>
#include <asm/iommu_table.h>
+
int swiotlb __read_mostly;
void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
pci_swiotlb_late_init);
/*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
*/
int __init pci_swiotlb_detect_4gb(void)
{
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
+
+ /*
+ * If SME is active then swiotlb will be set to 1 so that bounce
+ * buffers are allocated and used for devices that do not support
+ * the addressing range required for the encryption mask.
+ */
+ if (sme_active())
+ swiotlb = 1;
+
return swiotlb;
}
IOMMU_INIT(pci_swiotlb_detect_4gb,
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
- case X86_SUBARCH_LGUEST:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..bd6b85fac666 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
return ret;
}
#endif
+
void stop_this_cpu(void *dummy)
{
local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
- for (;;)
- halt();
+ for (;;) {
+ /*
+ * Use wbinvd followed by hlt to stop the processor. This
+ * provides support for kexec on a processor that supports
+ * SME. With kexec, going from SME inactive to SME active
+ * requires clearing cache entries so that addresses without
+ * the encryption bit set don't corrupt the same physical
+ * address that has the encryption bit set when caches are
+ * flushed. To achieve this a wbinvd is performed followed by
+ * a hlt. Even if the processor is not in the kexec/SME
+ * scenario this only adds a wbinvd to a halting processor.
+ */
+ asm volatile("wbinvd; hlt" : : : "memory");
+ }
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 22802162eeb9..11966251cd42 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
gs = get_user_gs(regs);
} else {
sp = kernel_stack_pointer(regs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 77a35c817b2b..302e7b2572d1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
}
}
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- unsigned prev_fsindex, prev_gsindex;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
switch_fpu_prepare(prev_fpu, cpu);
@@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
- savesegment(fs, prev_fsindex);
- savesegment(gs, prev_gsindex);
+ save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- /*
- * Switch FS and GS.
- *
- * These are even more complicated than DS and ES: they have
- * 64-bit bases are that controlled by arch_prctl. The bases
- * don't necessarily match the selectors, as user code can do
- * any number of things to cause them to be inconsistent.
- *
- * We don't promise to preserve the bases if the selectors are
- * nonzero. We also don't promise to preserve the base if the
- * selector is zero and the base doesn't match whatever was
- * most recently passed to ARCH_SET_FS/GS. (If/when the
- * FSGSBASE instructions are enabled, we'll need to offer
- * stronger guarantees.)
- *
- * As an invariant,
- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
- * impossible.
- */
- if (next->fsindex) {
- /* Loading a nonzero value into FS sets the index and base. */
- loadsegment(fs, next->fsindex);
- } else {
- if (next->fsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_fsindex)
- loadsegment(fs, 0);
- wrmsrl(MSR_FS_BASE, next->fsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- */
- loadsegment(fs, __USER_DS);
- loadsegment(fs, 0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_FS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->fsbase || prev_fsindex)
- loadsegment(fs, 0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_fsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_fsindex)
- prev->fsbase = 0;
- prev->fsindex = prev_fsindex;
-
- if (next->gsindex) {
- /* Loading a nonzero value into GS sets the index and base. */
- load_gs_index(next->gsindex);
- } else {
- if (next->gsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_gsindex)
- load_gs_index(0);
- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- *
- * This contains a pointless SWAPGS pair.
- * Fixing it would involve an explicit check
- * for Xen or a new pvop.
- */
- load_gs_index(__USER_DS);
- load_gs_index(0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_GS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->gsbase || prev_gsindex)
- load_gs_index(0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_gsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_gsindex)
- prev->gsbase = 0;
- prev->gsindex = prev_gsindex;
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, cpu);
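As a worked example of the helpers introduced above: for a switch between two 64-bit tasks that both run with a zero FS selector and a non-zero FS base set via ARCH_SET_FS, the new path reduces to a single WRMSR (a sketch of the control flow, not new code):

        save_base_legacy(prev_p, 0, FS);          /* selector 0: keep saved base   */
        load_seg_legacy(0, prev->fsbase,          /* prev_index == next_index == 0 */
                        0, next->fsbase, FS);
        /* next_base != 0  ->  wrmsrl(MSR_FS_BASE, next->fsbase); no segment load */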
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 98111b38ebfd..307d3bac5f04 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,6 +47,7 @@ relocate_kernel:
* %rsi page_list
* %rdx start address
* %rcx preserve_context
+ * %r8 sme_active
*/
/* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
pushq $0
popfq
+ /* Save SME active flag */
+ movq %r8, %r12
+
/*
* get physical address of control page now
* this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
/* Flush the TLB (needed?) */
movq %r9, %cr3
+ /*
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ */
+ testq %r12, %r12
+ jz 1f
+ wbinvd
+1:
+
movq %rcx, %r11
call swap_pages
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..022ebddb3734 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -69,6 +69,7 @@
#include <linux/crash_dump.h>
#include <linux/tboot.h>
#include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
#include <linux/usb/xhci-dbgp.h>
#include <video/edid.h>
@@ -115,6 +116,7 @@
#include <asm/microcode.h>
#include <asm/mmu_context.h>
#include <asm/kaslr.h>
+#include <asm/unwind.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -374,6 +376,14 @@ static void __init reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
+ /*
+ * If SME is active, this memory will be marked encrypted by the
+ * kernel when it is accessed (including relocation). However, the
+ * ramdisk image was loaded decrypted by the bootloader, so make
+ * sure that it is encrypted before accessing it.
+ */
+ sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
initrd_start = 0;
mapped_size = memblock_mem_size(max_pfn_mapped);
@@ -1310,6 +1320,8 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_apply_memmap_quirks();
#endif
+
+ unwind_init();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc30a74e4adb..e04442345fc0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = current->sas_ss_sp + current->sas_ss_size;
} else if (IS_ENABLED(CONFIG_X86_32) &&
!onsigstack &&
- (regs->ss & 0xffff) != __USER_DS &&
+ regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer) {
/* This is the legacy signal stack switching. */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b474c8de7fba..54b9e89d4d6b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ int *cpu0_nmi_registered)
{
volatile u32 *trampoline_status =
(volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long start_ip = real_mode_header->trampoline_start;
unsigned long boot_error = 0;
- int cpu0_nmi_registered = 0;
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
@@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
- &cpu0_nmi_registered);
+ cpu0_nmi_registered);
if (!boot_error) {
/*
@@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
*/
smpboot_restore_warm_reset_vector();
}
- /*
- * Clean up the nmi handler. Do this after the callin and callout sync
- * to avoid impact of possible long unregister time.
- */
- if (cpu0_nmi_registered)
- unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
return boot_error;
}
@@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
+ int cpu0_nmi_registered = 0;
unsigned long flags;
- int err;
+ int err, ret = 0;
WARN_ON(irqs_disabled());
@@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- err = do_boot_cpu(apicid, cpu, tidle);
+ err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
if (err) {
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- return -EIO;
+ ret = -EIO;
+ goto unreg_nmi;
}
/*
@@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- return 0;
+unreg_nmi:
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+ return ret;
}
/**
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 5f25cfbd952e..5ee663836c08 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
unsigned long addr, seg;
addr = regs->ip;
- seg = regs->cs & 0xffff;
+ seg = regs->cs;
if (v8086_mode(regs)) {
addr = (addr & 0xffff) + (seg << 4);
return addr;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 213ddf3e937d..73e4d28112f8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,6 +21,7 @@
#include <asm/compat.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
+#include <asm/mpx.h>
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
return error;
}
-static void find_start_end(unsigned long flags, unsigned long *begin,
- unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+ unsigned long *begin, unsigned long *end)
{
if (!in_compat_syscall() && (flags & MAP_32BIT)) {
/* This is usually needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
*begin = get_mmap_base(1);
- *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+ if (in_compat_syscall())
+ *end = task_size_32bit();
+ else
+ *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}
unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_unmapped_area_info info;
unsigned long begin, end;
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(addr, flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ *
+ * !in_compat_syscall() check to avoid high addresses for x32.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
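The DEFAULT_MAP_WINDOW handling keeps the larger address space strictly opt-in: a mapping only lands above the 47-bit boundary if user space passes a hint up there. A hypothetical user-space illustration (not part of the kernel patch):

        /* Hypothetical mmap() caller opting one mapping into the full
         * address space on a 5-level-paging kernel. */
        size_t len = 2 * 1024 * 1024;
        void *hint = (void *)(1UL << 47);
        void *p = mmap(hint, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);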
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..d145a0b1f529 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -10,20 +10,22 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
-/*
- * This disables KASAN checking when reading a value from another task's stack,
- * since the other task could be running on another CPU and could have poisoned
- * the stack in the meantime.
- */
-#define READ_ONCE_TASK_STACK(task, x) \
-({ \
- unsigned long val; \
- if (task == current) \
- val = READ_ONCE(x); \
- else \
- val = READ_ONCE_NOCHECK(x); \
- val; \
-})
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
static void unwind_dump(struct unwind_state *state)
{
@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
}
}
-unsigned long unwind_get_return_address(struct unwind_state *state)
-{
- if (unwind_done(state))
- return 0;
-
- return __kernel_text_address(state->ip) ? state->ip : 0;
-}
-EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
static size_t regs_size(struct pt_regs *regs)
{
/* x86_32 regs from kernel mode are two words shorter: */
@@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip)
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
-#endif
return false;
}
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 039f36738e49..4f0e17b90463 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
bool unwind_next_frame(struct unwind_state *state)
{
struct stack_info *info = &state->stack_info;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..570b70d3f604
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,582 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/sections.h>
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid = first, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
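
A stand-alone sketch (not the kernel code) of the "rightmost match" search described in the comment above: among entries whose start address is <= the query ip, keep the last one, so a real entry wins over an earlier duplicate or weak terminator.

    #include <stdio.h>

    /* Find the index of the last entry whose start address is <= ip, or -1. */
    static int find_rightmost_le(const unsigned long *starts, int n, unsigned long ip)
    {
            int first = 0, last = n - 1, found = -1;

            while (first <= last) {
                    int mid = first + (last - first) / 2;

                    if (starts[mid] <= ip) {
                            found = mid;            /* candidate; keep looking right */
                            first = mid + 1;
                    } else {
                            last = mid - 1;
                    }
            }
            return found;
    }

    int main(void)
    {
            unsigned long starts[] = { 0x100, 0x200, 0x200, 0x300 };

            /* 0x250 resolves to the rightmost 0x200 entry, index 2. */
            printf("%d\n", find_rightmost_le(starts, 4, 0x250));
            return 0;
    }
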
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ if (!orc_init)
+ return NULL;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
+ idx, lookup_num_blocks, ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
+ idx, lookup_num_blocks, start, stop, ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ return orc_module_find(ip);
+}
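
A hedged sketch of the two-level lookup orc_find() performs for vmlinux text: the ip selects a fixed-size block, the per-block table bounds the slice of ORC entries that can cover that block, and only that slice is binary-searched. The block size and table contents below are made up for the demo.

    #include <stdio.h>

    #define DEMO_BLOCK_SIZE 256     /* assumed lookup granularity, for illustration */

    /*
     * lookup[i] holds the index of the first ORC entry that can cover block i;
     * the table carries one extra trailing block so lookup[idx + 1] is always
     * valid (mirroring orc_lookup_end in the kernel).
     */
    static void block_range(const unsigned int *lookup, unsigned long text_start,
                            unsigned long ip, unsigned int *start, unsigned int *stop)
    {
            unsigned int idx = (ip - text_start) / DEMO_BLOCK_SIZE;

            *start = lookup[idx];
            *stop  = lookup[idx + 1] + 1;   /* +1: the next block's first entry may
                                             * still begin inside this block */
    }

    int main(void)
    {
            unsigned int lookup[] = { 0, 4, 9, 12 };        /* made-up indices */
            unsigned int start, stop;

            block_range(lookup, 0x1000, 0x1120, &start, &stop);
            printf("binary-search entries [%u, %u)\n", start, stop);
            return 0;
    }
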
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ struct orc_entry orc_tmp;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ orc_tmp = *orc_a;
+ *orc_a = *orc_b;
+ *orc_b = orc_tmp;
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be on the left
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
+}
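
A toy qsort() version of the tie-break rule in orc_sort_cmp(): entries are ordered by ip, and at equal ip a "weak" terminator sorts to the left so the rightmost-match search resolves to the real entry on its right. In the kernel the terminator test is sp_reg == ORC_REG_UNDEFINED; here it is a plain flag.

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy entry: "term" marks a weak section terminator. */
    struct demo_entry {
            unsigned long ip;
            int term;
    };

    static int demo_cmp(const void *_a, const void *_b)
    {
            const struct demo_entry *a = _a, *b = _b;

            if (a->ip != b->ip)
                    return a->ip < b->ip ? -1 : 1;

            /* Equal ip: terminator sorts left so the rightmost-match search skips it. */
            return b->term - a->term;
    }

    int main(void)
    {
            struct demo_entry e[] = { { 0x20, 0 }, { 0x20, 1 }, { 0x10, 0 } };

            qsort(e, 3, sizeof(e[0]), demo_cmp);
            for (int i = 0; i < 3; i++)
                    printf("%#lx term=%d\n", e[i].ip, e[i].term);
            return 0;
    }
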
+
+#ifdef CONFIG_MODULES
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */
+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+ orc_sort_swap);
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If the address isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the address
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, (void *)addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
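
A minimal sketch of the kind of bounds check stack_access_ok() relies on: a read of len bytes at addr is allowed only if the whole range fits inside a known stack region. The struct below is a stand-in, not the kernel's stack_info.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct demo_stack {
            uintptr_t begin;
            uintptr_t end;          /* one past the last valid byte */
    };

    /* A read of len bytes at addr is safe only if it fits entirely in the region. */
    static bool demo_on_stack(const struct demo_stack *s, uintptr_t addr, size_t len)
    {
            return addr >= s->begin &&
                   len <= (size_t)(s->end - s->begin) &&
                   addr <= s->end - len;
    }

    int main(void)
    {
            unsigned long stack[16];
            struct demo_stack s = {
                    .begin = (uintptr_t)stack,
                    .end   = (uintptr_t)(stack + 16),
            };

            printf("%d\n", demo_on_stack(&s, (uintptr_t)&stack[15], sizeof(long)));     /* 1 */
            printf("%d\n", demo_on_stack(&s, (uintptr_t)&stack[15], 2 * sizeof(long))); /* 0 */
            return 0;
    }
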
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+ return true;
+}
+
+#define REGS_SIZE (sizeof(struct pt_regs))
+#define SP_OFFSET (offsetof(struct pt_regs, sp))
+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp, bool full)
+{
+ size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+ size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+ struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ if (!stack_access_ok(state, addr, regs_size))
+ return false;
+
+ *ip = regs->ip;
+ *sp = regs->sp;
+
+ return true;
+ }
+
+ if (!stack_access_ok(state, addr, sp_offset))
+ return false;
+
+ *ip = regs->ip;
+
+ if (user_mode(regs)) {
+ if (!stack_access_ok(state, addr + sp_offset,
+ REGS_SIZE - SP_OFFSET))
+ return false;
+
+ *sp = regs->sp;
+ } else
+ *sp = (unsigned long)&regs->sp;
+
+ return true;
+}
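
The offset arithmetic just above can be illustrated with a mocked-up register layout; the field order here is an assumption for the demo, not the real x86 pt_regs. A bare exception/iret frame is only the tail of the full register block, so the IRET sizes and offsets are the full-size values minus everything that precedes the saved ip.

    #include <stddef.h>
    #include <stdio.h>

    /* Mock layout: general registers first, hardware exception frame last. */
    struct mock_regs {
            unsigned long bp, di, si, dx, cx, ax;
            unsigned long ip, cs, flags, sp, ss;    /* the iret frame */
    };

    #define MOCK_REGS_SIZE          (sizeof(struct mock_regs))
    #define MOCK_SP_OFFSET          (offsetof(struct mock_regs, sp))
    #define MOCK_IRET_REGS_SIZE     (MOCK_REGS_SIZE - offsetof(struct mock_regs, ip))
    #define MOCK_IRET_SP_OFFSET     (MOCK_SP_OFFSET - offsetof(struct mock_regs, ip))

    int main(void)
    {
            /* A bare iret frame is just the tail of the full register block. */
            printf("full=%zu iret=%zu sp@%zu iret_sp@%zu\n",
                   MOCK_REGS_SIZE, MOCK_IRET_REGS_SIZE,
                   MOCK_SP_OFFSET, MOCK_IRET_SP_OFFSET);
            return 0;
    }
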
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ struct pt_regs *ptregs;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ preempt_disable();
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto done;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * Decrement call return addresses by one so they work for sibling
+ * calls and calls to noreturn functions.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
+ goto done;
+ orig_ip = state->ip;
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R10 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r10;
+ break;
+
+ case ORC_REG_R13:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg R13 at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->r13;
+ break;
+
+ case ORC_REG_DI:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DI at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->di;
+ break;
+
+ case ORC_REG_DX:
+ if (!state->regs || !state->full_regs) {
+ orc_warn("missing regs for base reg DX at ip %p\n",
+ (void *)state->ip);
+ goto done;
+ }
+ sp = state->regs->dx;
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d for ip %p\n",
+ orc->sp_reg, (void *)state->ip);
+ goto done;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto done;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto done;
+
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ state->ip, (void *)ip_p);
+
+ state->sp = sp;
+ state->regs = NULL;
+ state->signal = false;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ orc_warn("can't dereference registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ state->regs = (struct pt_regs *)sp;
+ state->full_regs = true;
+ state->signal = true;
+ break;
+
+ case ORC_TYPE_REGS_IRET:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ orc_warn("can't dereference iret registers at %p for ip %p\n",
+ (void *)sp, (void *)orig_ip);
+ goto done;
+ }
+
+ ptregs = container_of((void *)sp, struct pt_regs, ip);
+ if ((unsigned long)ptregs >= prev_sp &&
+ on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+ state->regs = ptregs;
+ state->full_regs = false;
+ } else
+ state->regs = NULL;
+
+ state->signal = true;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+ break;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (state->regs && state->full_regs)
+ state->bp = state->regs->bp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto done;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %p\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto done;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn("stack going in the wrong direction? ip=%p\n",
+ (void *)orig_ip);
+ goto done;
+ }
+
+ preempt_enable();
+ return true;
+
+done:
+ preempt_enable();
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
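
A toy single step of the ORC_TYPE_CALL case handled above, outside the kernel: the ORC entry gives the previous frame's stack pointer as a base register (SP or BP) plus an offset, and the return address is the word just below that previous SP. All types and values here are simplified stand-ins, not the kernel structures.

    #include <stdio.h>

    enum demo_base { DEMO_BASE_SP, DEMO_BASE_BP };

    struct demo_orc {
            enum demo_base sp_reg;
            int sp_offset;
    };

    struct demo_state {
            unsigned long sp, bp, ip;
    };

    /* One ORC_TYPE_CALL-style step: recover the caller's SP and return address. */
    static void demo_step(struct demo_state *st, const struct demo_orc *orc,
                          const unsigned long *stack, unsigned long stack_base)
    {
            unsigned long prev_sp;

            prev_sp = (orc->sp_reg == DEMO_BASE_SP ? st->sp : st->bp) + orc->sp_offset;

            /* The return address was pushed immediately below the previous SP. */
            st->ip = stack[(prev_sp - sizeof(long) - stack_base) / sizeof(long)];
            st->sp = prev_sp;
    }

    int main(void)
    {
            unsigned long stack[4] = { 0, 0xdeadbeef, 0, 0 };       /* [1] = return address */
            unsigned long base = (unsigned long)stack;
            struct demo_state st = { .sp = base, .bp = 0, .ip = 0 };
            struct demo_orc orc = { .sp_reg = DEMO_BASE_SP, .sp_offset = 2 * sizeof(long) };

            demo_step(&st, &orc, stack, base);
            printf("ip=%#lx, unwound %lu bytes of stack\n", st.ip, st.sp - base);
            return 0;
    }
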
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto done;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto done;
+
+ state->ip = regs->ip;
+ state->sp = kernel_stack_pointer(regs);
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp;
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask))
+ return;
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+done:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..f05f00acac89 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
@@ -148,6 +149,8 @@ SECTIONS
BUG_TABLE
+ ORC_UNWIND_TABLE
+
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2eea522c..19adbb418443 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
/* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled)
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2595f9..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
-static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_pkru(vcpu);
-}
-
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b1dd114956a..04d750813c9d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
* PT32_LEVEL_BITS))) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
- | shadow_x_mask | shadow_nx_mask)
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
/*
* SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
*/
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask)
+ u64 acc_track_mask, u64 me_mask)
{
BUG_ON(!dirty_mask != !accessed_mask);
BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
shadow_acc_track_mask = acc_track_mask;
+ shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask;
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (sp_ad_disabled(sp))
spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
+ spte |= shadow_me_mask;
if (pte_access & ACC_WRITE_MASK) {
@@ -4106,16 +4109,28 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
/*
* Passing "true" to the last argument is okay; it adds a check
* on bit 8 of the SPTEs which KVM doesn't use anyway.
*/
- __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, uses_nx,
guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
true);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+
}
EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
@@ -4133,17 +4148,29 @@ static void
reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
- __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
boot_cpu_data.x86_phys_bits,
false);
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
}
/*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..4b9a3ae6b725 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* index of the protection domain, so pte_pkey * 2 is
* the index of the first bit for the domain.
*/
- pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
offset = (pfec & ~1) +
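
For reference, a tiny demo of the bit layout the replacement line relies on: PKRU holds two bits per protection key (access-disable, then write-disable), so key N's pair starts at bit 2*N. The AD/WD constants are defined locally for the demo.

    #include <stdio.h>

    #define DEMO_PKRU_AD    0x1     /* access-disable bit within a key's pair */
    #define DEMO_PKRU_WD    0x2     /* write-disable bit within a key's pair */

    static unsigned int pkey_bits(unsigned int pkru, unsigned int pkey)
    {
            return (pkru >> (pkey * 2)) & 3;
    }

    int main(void)
    {
            unsigned int pkru = 0x0c;       /* key 1: AD and WD both set */

            for (unsigned int key = 0; key < 2; key++)
                    printf("key%u: AD=%u WD=%u\n", key,
                           !!(pkey_bits(pkru, key) & DEMO_PKRU_AD),
                           !!(pkey_bits(pkru, key) & DEMO_PKRU_WD));
            return 0;
    }
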
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4d8141e533c3..8dbd8dbc83eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void)
if (vls) {
if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb;
struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
- phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
- phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
- phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_MWAIT);
}
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = __pa(svm->msrpm);
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return -EINVAL;
new_entry = READ_ONCE(*entry);
- new_entry = (page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
WRITE_ONCE(*entry, new_entry);
svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+ svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
svm->asid_generation = 0;
init_vmcb(svm);
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
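
The __sme_set()/__sme_clr() conversions added throughout this file reduce to OR-ing the SME encryption mask (the "C-bit") into a physical address, or masking it back out. A minimal sketch, with a made-up mask value rather than the one the hardware reports:

    #include <stdint.h>
    #include <stdio.h>

    static const uint64_t demo_me_mask = 1ULL << 47;        /* made-up C-bit position */

    static uint64_t demo_sme_set(uint64_t pa) { return pa | demo_me_mask; }
    static uint64_t demo_sme_clr(uint64_t pa) { return pa & ~demo_me_mask; }

    int main(void)
    {
            uint64_t pa  = 0x123456000ULL;
            uint64_t enc = demo_sme_set(pa);

            printf("plain    : %#llx\n", (unsigned long long)pa);
            printf("encrypted: %#llx\n", (unsigned long long)enc);
            printf("cleared  : %#llx\n", (unsigned long long)demo_sme_clr(enc));
            return 0;
    }
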
@@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
-static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
-
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -2335,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -2347,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.nested_cr3 = root;
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
svm_flush_tlb(vcpu);
}
@@ -2430,6 +2425,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+ * written only when inject_pending_event runs (DR6 would be written here
+ * too). This should be conditional on a new capability---if the
+ * capability is disabled, kvm_multiple_exception would write the
+ * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ */
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else
@@ -2868,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
}
@@ -4501,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
irq.vector);
*svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
vcpu_info->vector = irq.vector;
return 0;
@@ -4552,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
struct amd_iommu_pi_data pi;
/* Try to enable guest_mode in IRTE */
- pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
svm->vcpu.vcpu_id);
pi.is_guest_mode = true;
@@ -5001,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.cr3 = root;
+ svm->vmcb->save.cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_CR);
svm_flush_tlb(vcpu);
}
@@ -5010,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.nested_cr3 = root;
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
/* Also sync guest cr3 here in case we live migrate */
@@ -5403,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .get_pkru = svm_get_pkru,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 39a6222bf968..d40900914a72 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -416,13 +416,10 @@ struct nested_vmx {
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
- /* The host-usable pointer to the above */
- struct page *current_vmcs12_page;
- struct vmcs12 *current_vmcs12;
/*
* Cache of the guest's VMCS, existing outside of guest memory.
* Loaded from guest memory during VMPTRLD. Flushed to guest
- * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ * memory during VMCLEAR and VMPTRLD.
*/
struct vmcs12 *cached_vmcs12;
/*
@@ -639,8 +636,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- bool guest_pkru_valid;
- u32 guest_pkru;
u32 host_pkru;
/*
@@ -927,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2382,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->guest_pkru;
-}
-
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2428,6 +2422,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
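
A stand-alone sketch of how the interruption-information word built above is laid out: vector in the low byte, event type in bits 8-10, "error code valid" in bit 11, "valid" in bit 31. The constants are restated locally for the demo; the VMX headers and the SDM remain the authoritative source.

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_VECTOR_MASK         0xffu
    #define DEMO_TYPE_HARD_EXCEPTION (3u << 8)
    #define DEMO_DELIVER_CODE        (1u << 11)
    #define DEMO_VALID               (1u << 31)

    static uint32_t build_intr_info(uint8_t vector, int has_error_code)
    {
            uint32_t info = (vector & DEMO_VECTOR_MASK) |
                            DEMO_TYPE_HARD_EXCEPTION | DEMO_VALID;

            if (has_error_code)
                    info |= DEMO_DELIVER_CODE;
            return info;
    }

    int main(void)
    {
            /* A #PF (vector 14) that carries an error code. */
            printf("%#x\n", build_intr_info(14, 1));        /* 0x80000b0e */
            return 0;
    }
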
+
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2437,23 +2455,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
- if (!((vmcs12->exception_bitmap & (1u << nr)) ||
- (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
- return 0;
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ nested_vmx_inject_exception_vmexit(vcpu,
+ vcpu->arch.apf.nested_apf_token);
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+ return 1;
+ }
+ } else {
+ unsigned long exit_qual = 0;
+ if (nr == DB_VECTOR)
+ exit_qual = vcpu->arch.dr6;
- if (vcpu->arch.exception.nested_apf) {
- vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
- INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
- vcpu->arch.apf.nested_apf_token);
- return 1;
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 1;
+ }
}
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
+ return 0;
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2667,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_handled() will not pass related exits to L1.
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
* These rules have exceptions below.
*/
@@ -4955,6 +4988,28 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
}
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4962,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
void *vapic_page;
u16 status;
- if (vmx->nested.pi_desc &&
- vmx->nested.pi_pending) {
- vmx->nested.pi_pending = false;
- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
-
- max_irr = find_last_bit(
- (unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
- if (max_irr == 256)
- return;
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
vapic_page = kmap(vmx->nested.virtual_apic_page);
__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
kunmap(vmx->nested.virtual_apic_page);
@@ -4985,6 +5037,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -6502,7 +6556,7 @@ void vmx_enable_tdp(void)
enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
0ull, VMX_EPT_EXECUTABLE_MASK,
cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK);
+ VMX_EPT_RWX_MASK, 0ull);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
@@ -7134,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
if (vmx->nested.current_vmptr == -1ull)
return;
- /* current_vmptr and current_vmcs12 are always set/reset together */
- if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
- return;
-
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Flush VMCS12 to guest memory */
- memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
- VMCS12_SIZE);
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -7175,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.vmxon = false;
free_vpid(vmx->nested.vpid02);
- nested_release_vmcs12(vmx);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
if (vmx->nested.msr_bitmap) {
free_page((unsigned long)vmx->nested.msr_bitmap);
vmx->nested.msr_bitmap = NULL;
}
if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
free_vmcs(vmx->vmcs01.shadow_vmcs);
vmx->vmcs01.shadow_vmcs = NULL;
@@ -7579,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmcs12 = new_vmcs12;
- vmx->nested.current_vmcs12_page = page;
/*
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12,
- vmx->nested.current_vmcs12, VMCS12_SIZE);
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ nested_release_page_clean(page);
+
set_current_vmptr(vmx, vmptr);
}
@@ -8019,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* should handle it ourselves in L0 (and then continue L2). Only call this
* when in is_guest_mode (L2).
*/
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 exit_reason = vmx->exit_reason;
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
@@ -8033,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+ * may write to these pages via their host physical address while
+ * L2 is running, bypassing any address-translation-based dirty
+ * tracking (e.g. EPT write protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
if (vmx->nested.nested_run_pending)
return false;
@@ -8169,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
}
}
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8415,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
- nested_vmx_vmexit(vcpu, exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
- }
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
dump_vmcs();
@@ -8929,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (vmx->guest_pkru_valid)
- __write_pkru(vmx->guest_pkru);
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -9078,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
- if (boot_cpu_has(X86_FEATURE_OSPKE)) {
- vmx->guest_pkru = __read_pkru();
- if (vmx->guest_pkru != vmx->host_pkru) {
- vmx->guest_pkru_valid = true;
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
- } else
- vmx->guest_pkru_valid = false;
}
/*
@@ -9223,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
@@ -9509,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- else
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
kvm_inject_page_fault(vcpu, fault);
+ }
}
static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -10094,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
- *
- * A problem with this approach (when !enable_ept) is that L1 may be
- * injected with more page faults than it asked for. This could have
- * caused problems, but in practice existing hypervisors don't care.
- * To fix this, we will need to emulate the PFEC checking (on the L1
- * page tables), using walk_addr(), when injecting PFs to L1.
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10847,13 +10927,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
-
vmcs12->vm_exit_intr_info = exit_intr_info;
- if ((vmcs12->vm_exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
vmcs12->idt_vectoring_info_field = 0;
vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -11049,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
- && nested_exit_intr_ack_set(vcpu)) {
+ /*
+ * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+ * the VM-exit interrupt information (valid interrupt) is always set to
+ * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+ * kvm_cpu_has_interrupt(). See the commit message for details.
+ */
+ if (nested_exit_intr_ack_set(vcpu) &&
+ exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -11593,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .get_pkru = vmx_get_pkru,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c97c82814c4..ef5102f80497 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
#include <trace/events/kvm.h>
@@ -3159,15 +3160,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_set_hflags(vcpu, hflags);
vcpu->arch.smi_pending = events->smi.pending;
- if (events->smi.smm_inside_nmi)
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
}
}
@@ -3242,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest + offset, src, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
}
valid -= feature;
@@ -3280,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest, src + offset, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
}
valid -= feature;
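
A hedged sketch of the copy-loop pattern used by fill_xsave()/load_xsave() above: walk the valid-feature bitmap lowest bit first and divert exactly one feature (PKRU) to its own destination instead of the generic offset-based copy. The feature number, offsets and sizes below are invented for the demo; the kernel reads them from CPUID leaf 0xD.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define DEMO_MASK_PKRU (1u << 9)        /* demo feature number, not from CPUID */

    struct demo_vcpu { uint32_t pkru; };

    static void demo_load(struct demo_vcpu *v, const uint8_t *src, uint8_t *dest,
                          const uint32_t *offset, const uint32_t *size, uint32_t valid)
    {
            while (valid) {
                    uint32_t feature = valid & -valid;      /* lowest set bit */
                    int index = __builtin_ctz(feature);

                    if (feature == DEMO_MASK_PKRU)  /* divert PKRU to its own field */
                            memcpy(&v->pkru, src + offset[index], sizeof(v->pkru));
                    else                            /* generic offset-based copy */
                            memcpy(dest + offset[index], src + offset[index], size[index]);

                    valid -= feature;
            }
    }

    int main(void)
    {
            uint32_t offset[16] = { [2] = 0, [9] = 8 };
            uint32_t size[16]   = { [2] = 8, [9] = 4 };
            uint8_t  src[16]    = { 1, 2, 3, 4, 5, 6, 7, 8, 0xaa, 0xbb, 0xcc, 0xdd };
            uint8_t  dest[16]   = { 0 };
            struct demo_vcpu v  = { 0 };

            demo_load(&v, src, dest, offset, size, (1u << 2) | DEMO_MASK_PKRU);
            printf("pkru=%#x dest[0]=%u\n", v.pkru, (unsigned int)dest[0]);
            return 0;
    }
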
@@ -6113,7 +6126,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0);
+ PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6215,6 +6228,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
+ lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -6721,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
- /*
- * The physical address of apic access page is stored in the VMCS.
- * Update it when it becomes invalid.
- */
- if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -7629,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
trace_kvm_fpu(1);
}
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
- bool "Lguest guest support"
- depends on X86_32 && PARAVIRT && PCI
- select TTY
- select VIRTUALIZATION
- select VIRTIO
- select VIRTIO_CONSOLE
- help
- Lguest is a tiny in-kernel hypervisor. Selecting this will
- allow your kernel to boot under lguest. This option will increase
- your kernel size by about 10k. If in doubt, say N.
-
- If you say Y here, make sure you say Y (or M) to the virtio block
- and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y := head_32.o boot.o
-CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways. First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes. We call the first kernel the Host,
- * and the others the Guests. The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time. This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest? We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h> /* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
- .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_iret = (u32)lguest_noirq_iret,
- .kernel_address = PAGE_OFFSET,
- .blocked_interrupts = { 1 }, /* Block timer interrupts */
- .syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really. We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly. This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* Note: This code assumes we're uniprocessor. */
- static unsigned int next_call;
- unsigned long flags;
-
- /*
- * Disable interrupts if not already disabled: we don't want an
- * interrupt handler making a hypercall while we're already doing
- * one!
- */
- local_irq_save(flags);
- if (lguest_data.hcall_status[next_call] != 0xFF) {
- /* Table full, so do normal hcall which will flush table. */
- hcall(call, arg1, arg2, arg3, arg4);
- } else {
- lguest_data.hcalls[next_call].arg0 = call;
- lguest_data.hcalls[next_call].arg1 = arg1;
- lguest_data.hcalls[next_call].arg2 = arg2;
- lguest_data.hcalls[next_call].arg3 = arg3;
- lguest_data.hcalls[next_call].arg4 = arg4;
- /* Arguments must all be written before we mark it to go */
- wmb();
- lguest_data.hcall_status[next_call] = 0;
- if (++next_call == LHCALL_RING_SIZE)
- next_call = 0;
- }
- local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall(). This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off. Because hypercalls
- * are reasonably expensive, batching them up makes sense. For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, 0, 0, 0);
- else
- async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
- unsigned long arg1,
- unsigned long arg2)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, 0, 0);
- else
- async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, 0);
- else
- async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, arg4);
- else
- async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction. The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags"). The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
- return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
- lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment. Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules. In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register. To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch. One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled. There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag. If an interrupt does come
- * in, we then disable them for real. This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
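To illustrate the soft-disable idea from M:003 (a purely hypothetical sketch; none of these symbols exist in lguest or the kernel), the scheme boils down to a flag plus a deferred-delivery check:

static bool soft_irqs_on = true;	/* cheap to flip: no hypercall */
static bool irq_was_deferred;

static void soft_irq_disable(void)
{
	soft_irqs_on = false;
}

static void soft_irq_enable(void)
{
	soft_irqs_on = true;
	if (irq_was_deferred) {
		/* Rare path: an interrupt arrived while "disabled". */
		irq_was_deferred = false;
		/* ...really deliver it here, e.g. via a hypercall... */
	}
}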
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in. Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares? The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
- int entrynum, const gate_desc *g)
-{
- /*
- * The gate_desc structure is 8 bytes long: we hand it to the Host in
- * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
- * around like this; typesafety wasn't a big concern in Linux's early
- * years.
- */
- u32 *desc = (u32 *)g;
- /* Keep the local copy up to date. */
- native_write_idt_entry(dt, entrynum, g);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *idt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table. There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *gdt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
- const void *desc, int type)
-{
- native_write_gdt_entry(dt, entrynum, desc, type);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
- dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables). As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- /*
- * There's one problem which normal hardware doesn't have: the Host
- * can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway.
- */
- lazy_load_gs(0);
- lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy. Linux only
- * uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment. Some comments scattered through the
- * kernel code indicate that this was used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half of this treatment, it is now a
- * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 6 languages. I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction. Then I pretty much turned
- * off feature bits until the Guest booted. (Don't say that: you'll damage
- * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future proof.) No one's listening! They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int function = *ax;
-
- native_cpuid(ax, bx, cx, dx);
- switch (function) {
- /*
- * CPUID 0 gives the highest legal CPUID number (and the ID string).
- * We futureproof our code a little by sticking to known CPUID values.
- */
- case 0:
- if (*ax > 5)
- *ax = 5;
- break;
-
- /*
- * CPUID 1 is a basic feature request.
- *
- * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
- * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
- */
- case 1:
- *cx &= 0x00002201;
- *dx &= 0x07808151;
- /*
- * The Host can do a nice optimization if it knows that the
- * kernel mappings (addresses above 0xC0000000 or whatever
- * PAGE_OFFSET is set to) haven't changed. But Linux calls
- * flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set.
- */
- *dx |= 0x00002000;
- /*
- * We also lie, and say we're family id 5. 6 or greater
- * leads to a rdmsr in early_init_intel which we can't handle.
- * Family ID is returned in bits 8-11 of ax.
- */
- *ax &= 0xFFFFF0FF;
- *ax |= 0x00000500;
- break;
-
- /*
- * This is used to detect if we're running under KVM. We might be,
- * but that's a Host matter, not us. So say we're not.
- */
- case KVM_CPUID_SIGNATURE:
- *bx = *cx = *dx = 0;
- break;
-
- /*
- * 0x80000000 returns the highest Extended Function, so we futureproof
- * like we do above by limiting it to known fields.
- */
- case 0x80000000:
- if (*ax > 0x80000008)
- *ax = 0x80000008;
- break;
-
- /*
- * PAE systems can mark pages as non-executable. Linux calls this the
- * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch it off here, since we don't
- * support it.
- */
- case 0x80000001:
- *dx &= ~(1 << 20);
- break;
- }
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it. The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0. cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
- return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
- return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
- lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- current_cr3 = cr3;
-
- /* These two page tables are simple, linear, and used during boot */
- if (cr3 != __pa_symbol(swapper_pg_dir) &&
- cr3 != __pa_symbol(initial_page_table))
- cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
- return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
- return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant. The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
- * maps virtual addresses to physical addresses using "page tables". We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables. But since most virtual addresses
- * are unused, we use a two level index which saves space. The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages. Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page. If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- * Index into top Index into second Offset within page
- * page directory page pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper ]
- * [ Level ]
- * [ Entries ]
- * [(PUD Page)]---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- *
- * And the virtual address is decoded as:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into Index into mid Index into lower Offset within page
- * top entries directory page pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages. The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
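As a concrete reading of the 10/10/12 split above, here is a small sketch (illustration only, not code from this patch; the helper name is invented) that pulls the two indices and the page offset out of a non-PAE virtual address:

static inline void example_decode_vaddr(unsigned long vaddr)
{
	unsigned int pgd_index = (vaddr >> 22) & 0x3ff;	/* top 10 bits */
	unsigned int pte_index = (vaddr >> 12) & 0x3ff;	/* next 10 bits */
	unsigned int offset    = vaddr & 0xfff;		/* low 12 bits */

	pr_debug("pgd[%u] -> pte[%u] + 0x%x\n", pgd_index, pte_index, offset);
}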
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. We tell the Host the toplevel and
- * address this corresponds to. The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
- /* PAE needs to hand over a 64-bit page table entry, so it uses two args. */
- lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
- ptep->pte_low, ptep->pte_high);
-#else
- lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
- native_set_pud(pudp, pudval);
-
- /* 32 bytes aligned pdpt address and the index. */
- lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
- (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more. This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them. Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds! So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry. These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- native_set_pte_atomic(ptep, pte);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- native_pte_clear(mm, addr, ptep);
- lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
- lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations. On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present). Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
- /* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed. That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit. We don't actually "ack" interrupts as such, we
- * just mask and unmask them. I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
- set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
- clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
- .name = "lguest",
- .irq_mask = disable_lguest_irq,
- .irq_mask_ack = disable_lguest_irq,
- .irq_unmask = enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
- struct irq_desc *desc;
- int err;
-
- /* Returns -ve error or vector number. */
- err = irq_alloc_desc_at(irq, 0);
- if (err < 0 && err != -EEXIST)
- return err;
-
- /*
- * Tell the Linux infrastructure that the interrupt is
- * controlled by our level-based lguest interrupt controller.
- */
- irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
-
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- desc = irq_to_desc(irq);
- __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
- return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
- int err;
- u8 line = 0;
-
- /* We literally use the PCI interrupt line as the irq number. */
- pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
- err = lguest_setup_irq(line);
- if (!err)
- dev->irq = line;
- return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
- WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
- unsigned int i;
-
- for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
- if (i != IA32_SYSCALL_VECTOR)
- set_intr_gate(i, irq_entries_start +
- 8 * (i - FIRST_EXTERNAL_VECTOR));
- }
-
- /*
- * This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts.
- */
- irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
- *now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
- return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
- unsigned long sec, nsec;
-
- /*
- * Since the time is in two parts (seconds and nanoseconds), we risk
- * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
- * and getting 99 and 0. As Linux tends to come apart under the stress
- * of time travel, we must be careful:
- */
- do {
- /* First we read the seconds part. */
- sec = lguest_data.time.tv_sec;
- /*
- * This read memory barrier tells the compiler and the CPU that
- * this can't be reordered: we have to complete the above
- * before going on.
- */
- rmb();
- /* Now we read the nanoseconds part. */
- nsec = lguest_data.time.tv_nsec;
- /* Make sure we've done that. */
- rmb();
- /* Now if the seconds part has changed, try again. */
- } while (unlikely(lguest_data.time.tv_sec != sec));
-
- /* Our lguest clock is in real nanoseconds. */
- return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 200,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* FIXME: I don't think this can ever happen, but James tells me he had
- * to put this code in. Maybe we should remove it now. Anyone? */
- if (delta < LG_CLOCK_MIN_DELTA) {
- if (printk_ratelimit())
- printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __func__, delta);
- return -ETIME;
- }
-
- /* Please wake us this far in the future. */
- hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
- return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
- /* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
- return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
- .name = "lguest",
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = lguest_clockevent_set_next_event,
- .set_state_shutdown = lguest_clockevent_shutdown,
- .rating = INT_MAX,
- .mult = 1,
- .shift = 0,
- .min_delta_ns = LG_CLOCK_MIN_DELTA,
- .min_delta_ticks = LG_CLOCK_MIN_DELTA,
- .max_delta_ns = LG_CLOCK_MAX_DELTA,
- .max_delta_ticks = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
- unsigned long flags;
-
- /* Don't interrupt us while this is running. */
- local_irq_save(flags);
- lguest_clockevent.event_handler(&lguest_clockevent);
- local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure. The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
- /* Set up the timer interrupt (0) to go to our simple timer routine */
- if (lguest_setup_irq(0) != 0)
- panic("Could not set up timer irq");
- irq_set_handler(0, lguest_time_irq);
-
- clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
- /* We can't set cpumask in the initializer: damn C limitations! Set it
- * here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of(0);
- clockevents_register_device(&lguest_clockevent);
-
- /* Finally, we unblock the timer interrupt. */
- clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work. They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use. For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
- THREAD_SIZE / PAGE_SIZE);
- tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
- /* FIXME: Implement */
- return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
- /* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices). This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that. Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads. So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
- return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
- return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
- return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
- return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
- apic->read = lguest_apic_read;
- apic->write = lguest_apic_write;
- apic->icr_read = lguest_apic_icr_read;
- apic->icr_write = lguest_apic_icr_write;
- apic->wait_icr_idle = lguest_apic_wait_icr_idle;
- apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP! Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
- hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this is to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
- hcall(LHCALL_SHUTDOWN, __pa("Power down"),
- LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicking.
- *
- * Don't. But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
- hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
- /* The hcall won't return, but to keep gcc happy, we're "done". */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
- .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
- /*
- * The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit.
- */
- e820__range_add(boot_params.e820_table[0].addr,
- boot_params.e820_table[0].size,
- boot_params.e820_table[0].type);
-
- /* This string is for the boot messages. */
- return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
- /* Set up so that we access offset 'off' in BAR0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
- write_pci_config_byte(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, bar),
- 0);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, length),
- 4);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, offset),
- off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
- /*
- * We could set this up once, then leave it; nothing else in the
- * kernel should touch these registers. But if it went wrong, that
- * would be a horrible bug to find.
- */
- set_cfg_window(cfg_offset, off);
- write_pci_config(0, 1, 0,
- cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
- u8 cap, common_cap = 0, device_cap = 0;
- u32 device_len;
-
- /* Avoid recursive printk into here. */
- console_cfg_offset = -1;
-
- if (!early_pci_allowed()) {
- printk(KERN_ERR "lguest: early PCI access not allowed!\n");
- return;
- }
-
- /* We expect a console PCI device at BUS0, slot 1. */
- if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
- printk(KERN_ERR "lguest: PCI device is %#x!\n",
- read_pci_config(0, 1, 0, 0));
- return;
- }
-
- /* Find the capabilities we need (must be in bar0) */
- cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
- while (cap) {
- u8 vndr = read_pci_config_byte(0, 1, 0, cap);
- if (vndr == PCI_CAP_ID_VNDR) {
- u8 type, bar;
-
- type = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, cfg_type));
- bar = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, bar));
-
- switch (type) {
- case VIRTIO_PCI_CAP_DEVICE_CFG:
- if (bar == 0)
- device_cap = cap;
- break;
- case VIRTIO_PCI_CAP_PCI_CFG:
- console_access_cap = cap;
- break;
- }
- }
- cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
- }
- if (!device_cap || !console_access_cap) {
- printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
- common_cap, device_cap, console_access_cap);
- return;
- }
-
- /*
- * Note that we can't check features until we've set the DRIVER
- * status bit. We don't want to do that until we have a real driver,
- * so we just check that the device-specific config has room for
- * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
- * it should ignore the access.
- */
- device_len = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, length));
- if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
- + sizeof(u32))) {
- printk(KERN_ERR "lguest: console missing emerg_wr field\n");
- return;
- }
-
- console_cfg_offset = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, offset));
- printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
- /* If we couldn't find PCI console, forget it. */
- if (console_cfg_offset < 0)
- return count;
-
- if (unlikely(!console_cfg_offset)) {
- probe_pci_console();
- if (console_cfg_offset < 0)
- return count;
- }
-
- write_bar_via_cfg(console_access_cap,
- console_cfg_offset
- + offsetof(struct virtio_console_config, emerg_wr),
- buf[0]);
- return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
- hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel. This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"? The rest of that quote is
- * "... But that usually will create another problem." This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient. We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save the interrupt flag. We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
- const char *start, *end;
-} lguest_insns[] = {
- [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
- unsigned long addr, unsigned len)
-{
- unsigned int insn_len;
-
- /* Don't do anything special if we don't have a replacement */
- if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
- /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
- if (len < insn_len)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- /* Copy in our instructions. */
- memcpy(ibuf, lguest_insns[type].start, insn_len);
- return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest. The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
- /* We're under lguest. */
- pv_info.name = "lguest";
- /* We're running at privilege level 1, not 0 as normal. */
- pv_info.kernel_rpl = 1;
- /* Everyone except Xen runs with this set. */
- pv_info.shared_kernel_pmd = 1;
-
- /*
- * We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves.
- */
-
- /* Interrupt-related operations */
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
- pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
- pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
- pv_irq_ops.safe_halt = lguest_safe_halt;
-
- /* Setup operations */
- pv_init_ops.patch = lguest_patch;
-
- /* Intercepts of various CPU instructions */
- pv_cpu_ops.load_gdt = lguest_load_gdt;
- pv_cpu_ops.cpuid = lguest_cpuid;
- pv_cpu_ops.load_idt = lguest_load_idt;
- pv_cpu_ops.iret = lguest_iret;
- pv_cpu_ops.load_sp0 = lguest_load_sp0;
- pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
- pv_cpu_ops.set_ldt = lguest_set_ldt;
- pv_cpu_ops.load_tls = lguest_load_tls;
- pv_cpu_ops.get_debugreg = lguest_get_debugreg;
- pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.read_cr0 = lguest_read_cr0;
- pv_cpu_ops.write_cr0 = lguest_write_cr0;
- pv_cpu_ops.read_cr4 = lguest_read_cr4;
- pv_cpu_ops.write_cr4 = lguest_write_cr4;
- pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
- pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
- pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
- pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
- /* Pagetable management */
- pv_mmu_ops.write_cr3 = lguest_write_cr3;
- pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
- pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
- pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- pv_mmu_ops.set_pte = lguest_set_pte;
- pv_mmu_ops.set_pte_at = lguest_set_pte_at;
- pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
- pv_mmu_ops.pte_clear = lguest_pte_clear;
- pv_mmu_ops.pmd_clear = lguest_pmd_clear;
- pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
- pv_mmu_ops.read_cr2 = lguest_read_cr2;
- pv_mmu_ops.read_cr3 = lguest_read_cr3;
- pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
- pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
- pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* APIC read/write intercepts */
- set_lguest_basic_apic_ops();
-#endif
-
- x86_init.resources.memory_setup = lguest_memory_setup;
- x86_init.irqs.intr_init = lguest_init_IRQ;
- x86_init.timers.timer_init = lguest_time_init;
- x86_platform.calibrate_tsc = lguest_tsc_khz;
- x86_platform.get_wallclock = lguest_get_wallclock;
-
- /*
- * Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init().
- */
-
- /*G:070
- * Now we've seen all the paravirt_ops, we return to
- * lguest_init() where the rest of the fairly chaotic boot setup
- * occurs.
- */
-
- /*
- * The stack protector is a weird thing where gcc places a canary
- * value on the stack and then checks it on return. This file is
- * compiled with -fno-stack-protector, so we got this far without
- * problems. The value of the canary is kept at offset 20 from the
- * %gs register, so we need to set that up before calling C functions
- * in other files.
- */
- setup_stack_canary_segment(0);
-
- /*
- * We could just call load_stack_canary_segment(), but we might as well
- * call switch_to_new_gdt() which loads the whole table and sets up the
- * per-cpu segment descriptor register %fs as well.
- */
- switch_to_new_gdt(0);
-
- /*
- * The Host<->Guest Switcher lives at the top of our address space, and
- * the Host told us how big it is when we made the LGUEST_INIT hypercall:
- * it put the answer in lguest_data.reserve_mem
- */
- reserve_top_address(lguest_data.reserve_mem);
-
- /* Hook in our special panic hypercall code. */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
- /*
- * This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do it too:
- */
- cpu_detect(&new_cpu_data);
- /* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
- /* Math is always hard! */
- set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
- /* We don't have features. We have puppies! Puppies! */
-#ifdef CONFIG_X86_MCE
- mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
-#endif
-
- /*
- * We set the preferred console to "hvc". This is the "hypervisor
- * virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use.
- */
- add_preferred_console("hvc", 0, NULL);
-
- /* Register our very early console. */
- virtio_cons_early_init(early_put_chars);
-
- /* Don't let ACPI try to control our PCI interrupts. */
- disable_acpi();
-
- /* We control them ourselves, by overriding these two hooks. */
- pcibios_enable_irq = lguest_enable_irq;
- pcibios_disable_irq = lguest_disable_irq;
-
- /*
- * Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off, and the reboot hook to our restart
- * routine.
- */
- pm_power_off = lguest_power_off;
- machine_ops.restart = lguest_restart;
-
- /*
- * Now we're set up, call i386_start_kernel() in head32.c and we proceed
- * to boot as normal. It never returns.
- */
- i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables. Finally it checks the 'hardware_subarch' field. This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here! We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
- /*
- * We make the "initialization" hypercall now to tell the Host where
- * our lguest_data struct is.
- */
- movl $LHCALL_LGUEST_INIT, %eax
- movl $lguest_data - __PAGE_OFFSET, %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
- movl $LHCALL_NEW_PGTABLE, %eax
- movl $(initial_page_table - __PAGE_OFFSET), %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Set up the initial stack so we can run C code. */
- movl $(init_thread_union+THREAD_SIZE),%esp
-
- /* Jumps are relative: we're running __PAGE_OFFSET too low. */
- jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...) \
- lgstart_##name: insns; lgend_##name:; \
- .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later). If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail, so I'll describe it in easy stages. First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
- /*
- * The reverse of irq_disable, this sets lguest_data.irq_enabled to
- * X86_EFLAGS_IF (ie. "Interrupts enabled").
- */
- movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * But now we need to check if the Host wants to know: there might have
- * been interrupts waiting to be delivered, in which case it will have
- * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
- * jump to send_interrupts, otherwise we're done.
- */
- cmpl $0, lguest_data+LGUEST_DATA_irq_pending
- jnz send_interrupts
- /*
- * One cool thing about x86 is that you can do many things without using
- * a register. In this case, the normal path hasn't needed to save or
- * restore any registers at all!
- */
- ret
-send_interrupts:
- /*
- * OK, now we need a register: eax is used for the hypercall number,
- * which is LHCALL_SEND_INTERRUPTS.
- *
- * We used not to bother with this pending detection at all, which was
- * much simpler. Sooner or later the Host would realize it had to
- * send us an interrupt. But that turns out to make performance 7
- * times worse on a simple tcp benchmark. So now we do this the hard
- * way.
- */
- pushl %eax
- movl $LHCALL_SEND_INTERRUPTS, %eax
- /* This is the actual hypercall trap. */
- int $LGUEST_TRAP_ENTRY
- /* Put eax back the way we found it. */
- popl %eax
- ret
-
-/*
- * Finally, the "popf" or "restore flags" routine. The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
- /* This is just "lguest_data.irq_enabled = flags;" */
- movl %eax, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * Now, if the %eax value has enabled interrupts and
- * lguest_data.irq_pending is set, we want to tell the Host so it can
- * deliver any outstanding interrupts. Fortunately, both values will
- * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
- * instruction will AND them together for us. If both are set, we
- * jump to send_interrupts.
- */
- testl lguest_data+LGUEST_DATA_irq_pending, %eax
- jnz send_interrupts
- /* Again, the normal path has used no extra registers. Clever, huh? */
- ret
-/*:*/
-
- /* This marks the EIP where the Host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it. However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway. If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last. It's *cool*! It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap. The
- * stack looks like this:
- * old address
- * old code segment & privilege level
- * old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once. The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic. The first problem
- * is solved by using a "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect the pop instruction, since the
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret.)
- */
-ENTRY(lguest_iret)
- pushl 2*4(%esp)
- /*
- * Note the %ss: segment prefix here. Normal data accesses use the
- * "ds" segment, but that will have already been restored for whatever
- * we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid.
- */
- popl %ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
- iret
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
return 0; /* Buffer overrun */
}
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of whether it was
+ * truncated to fit in the buffer), or -1 if not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
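A hypothetical usage sketch of the new helper (not part of this patch; boot_command_line is the kernel's saved command line and the function name is invented): for a command line containing "root=/dev/sda1 quiet", this copies "/dev/sda1" into the buffer and returns its length, or -1 if "root=" never appears.

static void example_cmdline_lookup(void)
{
	char buf[32];
	int len;

	len = cmdline_find_option(boot_command_line, "root", buf, sizeof(buf));
	if (len > 0)
		pr_info("root= argument: %s (length %d)\n", buf, len);
}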
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index f77ba3058b31..066996dba6a2 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -363,3 +363,4 @@ L_bugged_2:
pop %ebx
jmp L_exit
#endif /* PARANOID */
+ENDPROC(div_Xsig)
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index 47099628fa4c..2c71527bd917 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,4 +44,4 @@ ENTRY(FPU_div_small)
leave
ret
-
+ENDPROC(FPU_div_small)
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 717785a53eb4..22e0631bb85a 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,6 +62,7 @@ ENTRY(mul32_Xsig)
popl %esi
leave
ret
+ENDPROC(mul32_Xsig)
ENTRY(mul64_Xsig)
@@ -114,6 +115,7 @@ ENTRY(mul64_Xsig)
popl %esi
leave
ret
+ENDPROC(mul64_Xsig)
@@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig)
popl %esi
leave
ret
-
+ENDPROC(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 17315c89ff3d..a9aaf414135d 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,3 +133,4 @@ L_accum_done:
popl %esi
leave
ret
+ENDPROC(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index 8b6352efceef..53ac1a343c69 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -94,6 +94,7 @@ L_overflow:
call arith_overflow
pop %ebx
jmp L_exit
+ENDPROC(FPU_normalize)
@@ -145,3 +146,4 @@ L_exit_nuo_zero:
popl %ebx
leave
ret
+ENDPROC(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index d1d4e48b4f67..41af5b208d88 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -706,3 +706,5 @@ L_exception_exit:
mov $-1,%eax
jmp fpu_reg_round_special_exit
#endif /* PARANOID */
+
+ENDPROC(FPU_round)
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 47c4c2434d85..3b1bc5e9b2f6 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -165,3 +165,4 @@ L_exit:
leave
ret
#endif /* PARANOID */
+ENDPROC(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index cc00654b6f9a..796eb5ab921b 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -469,3 +469,5 @@ L_exit:
leave
ret
#endif /* PARANOID */
+
+ENDPROC(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 973f12af97df..6196f68cf3c1 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -146,3 +146,4 @@ L_exit:
ret
#endif /* PARANOID */
+ENDPROC(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index 1b6c24801d22..d115b900919a 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,3 +270,4 @@ L_exit:
popl %esi
leave
ret
+ENDPROC(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index bbe0e87718e4..87c99749a495 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
popl %ebx
leave
ret
-
+ENDPROC(round_Xsig)
@@ -138,4 +138,4 @@ L_n_exit:
popl %ebx
leave
ret
-
+ENDPROC(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 31cdd118e918..c8552edeec75 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -85,3 +85,4 @@ L_more_than_95:
popl %esi
leave
ret
+ENDPROC(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 518428317985..340dd6897f85 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -92,6 +92,7 @@ L_more_than_95:
popl %esi
leave
ret
+ENDPROC(FPU_shrx)
/*---------------------------------------------------------------------------+
@@ -202,3 +203,4 @@ Ls_more_than_95:
popl %esi
leave
ret
+ENDPROC(FPU_shrxs)
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
index d258f59564e1..695afae38fdf 100644
--- a/arch/x86/math-emu/wm_sqrt.S
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -468,3 +468,4 @@ sqrt_more_prec_large:
/* Our estimate is too large */
movl $0x7fffff00,%eax
jmp sqrt_round_result
+ENDPROC(wm_sqrt)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..72bf8c01c6e3 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..5e3ac6fe6c9e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -13,12 +13,12 @@
*/
#include <linux/debugfs.h>
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
-#include <asm/kasan.h>
#include <asm/pgtable.h>
/*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
- { "cr3", "pgd", "pud", "pmd", "pte" };
+ { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) {
/* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
pt_dump_cont_printf(m, dmsg, " ");
/* Bit 7 has a different meaning on level 3 vs 4 */
- if (level <= 3 && pr & _PAGE_PSE)
+ if (level <= 4 && pr & _PAGE_PSE)
pt_dump_cont_printf(m, dmsg, "PSE ");
else
pt_dump_cont_printf(m, dmsg, " ");
- if ((level == 4 && pr & _PAGE_PAT) ||
- ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+ if ((level == 5 && pr & _PAGE_PAT) ||
+ ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
pt_dump_cont_printf(m, dmsg, "PAT ");
else
pt_dump_cont_printf(m, dmsg, " ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
*/
static unsigned long normalize_addr(unsigned long u)
{
-#ifdef CONFIG_X86_64
- return (signed long)(u << 16) >> 16;
-#else
- return u;
-#endif
+ int shift;
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return u;
+
+ shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ return (signed long)(u << shift) >> shift;
}
/*
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- note_page(m, st, __pgprot(prot), 4);
+ note_page(m, st, __pgprot(prot), 5);
start++;
}
}
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for a 5-level config) while checking for
+ * W+X mappings or reading the kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+ __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+ __pa(pt) == __pa(kasan_zero_pud)) {
+ pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+ note_page(m, st, __pgprot(prot), 5);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ return false;
+}
+#endif
#if PTRS_PER_PMD > 1
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
- pmd_t *start;
+ pmd_t *start, *pmd_start;
pgprotval_t prot;
- start = (pmd_t *)pud_page_vaddr(addr);
+ pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
if (pmd_large(*start) || !pmd_present(*start)) {
prot = pmd_flags(*start);
- note_page(m, st, __pgprot(prot), 3);
- } else {
+ note_page(m, st, __pgprot(prot), 4);
+ } else if (!kasan_page_table(m, st, pmd_start)) {
walk_pte_level(m, st, *start,
P + i * PMD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 3);
+ note_page(m, st, __pgprot(0), 4);
start++;
}
}
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
#if PTRS_PER_PUD > 1
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
- return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
- pud_t *start;
+ pud_t *start, *pud_start;
pgprotval_t prot;
pud_t *prev_pud = NULL;
- start = (pud_t *)p4d_page_vaddr(addr);
+ pud_start = start = (pud_t *)p4d_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
- if (!pud_none(*start) &&
- !pud_already_checked(prev_pud, start, st->check_wx)) {
+ if (!pud_none(*start)) {
if (pud_large(*start) || !pud_present(*start)) {
prot = pud_flags(*start);
- note_page(m, st, __pgprot(prot), 2);
- } else {
+ note_page(m, st, __pgprot(prot), 3);
+ } else if (!kasan_page_table(m, st, pud_start)) {
walk_pmd_level(m, st, *start,
P + i * PUD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 2);
+ note_page(m, st, __pgprot(0), 3);
prev_pud = start;
start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
int i;
- p4d_t *start;
+ p4d_t *start, *p4d_start;
pgprotval_t prot;
- start = (p4d_t *)pgd_page_vaddr(addr);
+ p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_P4D; i++) {
st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
if (p4d_large(*start) || !p4d_present(*start)) {
prot = p4d_flags(*start);
note_page(m, st, __pgprot(prot), 2);
- } else {
+ } else if (!kasan_page_table(m, st, p4d_start)) {
walk_pud_level(m, st, *start,
P + i * P4D_LEVEL_MULT);
}
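The normalize_addr() rewrite above derives the shift from __VIRTUAL_MASK_SHIFT instead of hard-coding 16, so the sign extension is correct for both 4-level (__VIRTUAL_MASK_SHIFT == 47) and 5-level (== 56) paging. A standalone sketch of the arithmetic, with illustrative addresses:

#include <stdio.h>

/* Sign-extend a virtual address with 'vbits' significant bits
 * (vbits == __VIRTUAL_MASK_SHIFT + 1 in kernel terms). */
static unsigned long normalize(unsigned long u, int vbits)
{
	int shift = 64 - vbits;

	return (unsigned long)((signed long)(u << shift) >> shift);
}

int main(void)
{
	/* 4-level paging, 48 significant bits: bit 47 is replicated upward */
	printf("%lx\n", normalize(0x0000800000000000UL, 48)); /* ffff800000000000 */
	/* 5-level paging, 57 significant bits: bit 56 is replicated upward */
	printf("%lx\n", normalize(0x0100000000000000UL, 57)); /* ff00000000000000 */
	return 0;
}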
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..c076f710de4c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fault);
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+bool ex_handler_refcount(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ /* First unconditionally saturate the refcount. */
+ *(int *)regs->cx = INT_MIN / 2;
+
+ /*
+ * Strictly speaking, this reports the fixup destination, not
+ * the fault location, and not the actually overflowing
+ * instruction, which is the instruction before the "js", but
+ * since that instruction could be a variety of lengths, just
+ * report the location after the overflow, which should be close
+ * enough for finding the overflow, as it's at least back in
+ * the function, having returned from .text.unlikely.
+ */
+ regs->ip = ex_fixup_addr(fixup);
+
+ /*
+ * This function has been called because either a negative refcount
+ * value was seen by any of the refcount functions, or a zero
+ * refcount value was seen by refcount_dec().
+ *
+ * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+ * wrapped around) will be set. Additionally, seeing the refcount
+ * reach 0 will set ZF (Zero Flag: result was zero). In each of
+ * these cases we want a report, since it's a boundary condition.
+ *
+ */
+ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+ bool zero = regs->flags & X86_EFLAGS_ZF;
+
+ refcount_error_report(regs, zero ? "hit zero" : "overflow");
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_refcount);
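In plain C, the policy the handler implements looks roughly like the sketch below. It is illustrative only; overflowed and hit_zero stand in for the OF and ZF bits the handler reads from regs->flags, and refcount_trap_sketch() is not a real kernel function:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static void refcount_trap_sketch(int *refcount, bool overflowed, bool hit_zero)
{
	/* Park the counter well into negative territory so that further
	 * increments or decrements leave it saturated for a long time. */
	*refcount = INT_MIN / 2;

	/* Only the boundary conditions are worth a report. */
	if (overflowed || hit_zero)
		printf("refcount %s detected\n", hit_zero ? "hit zero" : "overflow");
}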
+
bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
@@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
* undefined. I'm not sure which CPUs do this, but at least
* the 486 DX works this way.
*/
- if ((regs->cs & 0xFFFF) != __KERNEL_CS)
+ if (regs->cs != __KERNEL_CS)
goto fail;
/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..0cdf14cf3270 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address)
pte_t *pte;
#ifdef CONFIG_X86_PAE
- printk("*pdpt = %016Lx ", pgd_val(*pgd));
+ pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
+#define pr_pde pr_cont
+#else
+#define pr_pde pr_info
#endif
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
- printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+ pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+#undef pr_pde
/*
* We must not directly access the pte in the highpte
@@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address)
goto out;
pte = pte_offset_kernel(pmd, address);
- printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+ pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
- printk("\n");
+ pr_cont("\n");
}
#else /* CONFIG_X86_64: */
@@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pgd))
goto bad;
- printk("PGD %lx ", pgd_val(*pgd));
+ pr_info("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd))
goto out;
@@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(p4d))
goto bad;
- printk("P4D %lx ", p4d_val(*p4d));
+ pr_cont("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d) || p4d_large(*p4d))
goto out;
@@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pud))
goto bad;
- printk("PUD %lx ", pud_val(*pud));
+ pr_cont("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud) || pud_large(*pud))
goto out;
@@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pmd))
goto bad;
- printk("PMD %lx ", pmd_val(*pmd));
+ pr_cont("PMD %lx ", pmd_val(*pmd));
if (!pmd_present(*pmd) || pmd_large(*pmd))
goto out;
@@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pte))
goto bad;
- printk("PTE %lx", pte_val(*pte));
+ pr_cont("PTE %lx", pte_val(*pte));
out:
- printk("\n");
+ pr_cont("\n");
return;
bad:
- printk("BAD\n");
+ pr_info("BAD\n");
}
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 2824607df108..6d06cf33e3de 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/elf.h>
+#include <asm/mpx.h>
#if 0 /* This is just for testing */
struct page *
@@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
info.low_limit = get_mmap_base(1);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
info.high_limit = in_compat_syscall() ?
- tasksize_32bit() : tasksize_64bit();
+ task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr0, unsigned long len,
+ unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
- unsigned long addr;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
+ info.high_limit = TASK_SIZE_LOW;
addr = vm_unmapped_area(&info);
}
@@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len & ~huge_page_mask(h))
return -EINVAL;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
if (len > TASK_SIZE)
return -ENOMEM;
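The two hugetlb hunks above open up the address space beyond the 47-bit DEFAULT_MAP_WINDOW only when a native (non-compat) caller passes a hint above it. A condensed restatement of the bottom-up branch, with pick_high_limit() being a hypothetical helper:

static unsigned long pick_high_limit(unsigned long hint, bool compat)
{
	if (compat)
		return task_size_32bit();

	/* Full address space (TASK_SIZE_MAX) only for a hint above the window */
	return task_size_64bit(hint > DEFAULT_MAP_WINDOW);
}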
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index adab1595f4bd..31cea988fa36 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
if (!pmd)
return -ENOMEM;
ident_pmd_init(info, pmd, addr, next);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
}
return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
if (!pud)
return -ENOMEM;
ident_pud_init(info, pud, addr, next);
- set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
}
return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long next;
int result;
+ /* Set the default pagetable flags if not supplied */
+ if (!info->kernpg_flag)
+ info->kernpg_flag = _KERNPG_TABLE;
+
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
if (result)
return result;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
* With p4d folded, pgd is equal to p4d.
* The pgd entry has to point to the pud page table in this case.
*/
pud_t *pud = pud_offset(p4d, 0);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
}
}
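The kernpg_flag field referenced above lets a caller propagate extra protection bits into the intermediate table entries instead of always using _KERNPG_TABLE, so an identity map can be built with or without the encryption bit in its table entries. A hedged sketch of such a caller; alloc_pgt_page and alloc_context are hypothetical, and the field names assume the usual struct x86_mapping_info layout:

static int sketch_ident_map_encrypted(pgd_t *pgd, unsigned long mstart,
				      unsigned long mend)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,	/* hypothetical allocator callback */
		.context	= &alloc_context,	/* hypothetical allocator state */
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC,
		.kernpg_flag	= _KERNPG_TABLE | _PAGE_ENC, /* encrypted table entries */
	};

	return kernel_ident_mapping_init(&info, pgd, mstart, mend);
}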
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 673541eb3b3f..7777ccc0e9f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,7 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
+#include <asm/hypervisor.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -636,6 +637,8 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
+ hypervisor_init_mem_mapping();
+
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
@@ -812,7 +815,7 @@ void __init zone_sizes_init(void)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
- .state = 0,
+ .next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4c1b5fd0c7ad..34f0e1847dd6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,8 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -21,6 +23,7 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
+#include <asm/setup.h>
#include "physaddr.h"
@@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
}
/*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (is_ISA_range(phys_addr, last_addr))
- return (__force void __iomem *)phys_to_virt(phys_addr);
-
- /*
* Don't allow anybody to remap normal RAM that we're using..
*/
pfn = phys_addr >> PAGE_SHIFT;
@@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr)
return;
/*
- * __ioremap special-cases the PCI/ISA range by not instantiating a
- * vm_area and by simply returning an address into the kernel mapping
- * of ISA space. So handle that here.
+ * The PCI/ISA range special-casing was removed from __ioremap()
+ * so this check, in theory, can be removed. However, there are
+ * cases where iounmap() is called for addresses not obtained via
+ * ioremap() (vga16fb for example). Add a warning so that these
+ * cases can be caught and fixed.
*/
if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
- (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+ (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+ WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
return;
+ }
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
unsigned long offset = phys & ~PAGE_MASK;
void *vaddr;
- /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
- if (page_is_ram(start >> PAGE_SHIFT))
- return __va(phys);
+ /* memremap() maps if RAM, otherwise falls back to ioremap() */
+ vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
- vaddr = ioremap_cache(start, PAGE_SIZE);
- /* Only add the offset on success and return NULL if the ioremap() failed: */
+ /* Only add the offset on success and return NULL if memremap() failed */
if (vaddr)
vaddr += offset;
@@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
{
- if (page_is_ram(phys >> PAGE_SHIFT))
- return;
+ memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted. If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ int is_pmem;
+
+ /*
+ * Check if the address is part of a persistent memory region.
+ * This check covers areas added by E820, EFI and ACPI.
+ */
+ is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+ IORES_DESC_PERSISTENT_MEMORY);
+ if (is_pmem != REGION_DISJOINT)
+ return true;
+
+ /*
+ * Check if the non-volatile attribute is set for an EFI
+ * reserved area.
+ */
+ if (efi_enabled(EFI_BOOT)) {
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_RESERVED_TYPE:
+ if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+ return true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* Check if the address is outside kernel usable area */
+ switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ case E820_TYPE_PRAM:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ u64 paddr;
+
+ /* Check if the address is part of EFI boot/runtime data */
+ if (!efi_enabled(EFI_BOOT))
+ return false;
+
+ paddr = boot_params.efi_info.efi_memmap_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_memmap;
+ if (phys_addr == paddr)
+ return true;
+
+ paddr = boot_params.efi_info.efi_systab_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_systab;
+ if (phys_addr == paddr)
+ return true;
+
+ if (efi_is_table_address(phys_addr))
+ return true;
+
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_RUNTIME_SERVICES_DATA:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = memremap(paddr, sizeof(*data),
+ MEMREMAP_WB | MEMREMAP_DEC);
+
+ paddr_next = data->next;
+ len = data->len;
+
+ memunmap(data);
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = early_memremap_decrypted(paddr, sizeof(*data));
+
+ paddr_next = data->next;
+ len = data->len;
+
+ early_memunmap(data, sizeof(*data));
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags)
+{
+ if (!sme_active())
+ return true;
+
+ if (flags & MEMREMAP_ENC)
+ return true;
+
+ if (flags & MEMREMAP_DEC)
+ return false;
+
+ if (memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size) ||
+ memremap_should_map_decrypted(phys_addr, size))
+ return false;
+
+ return true;
+}
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ if (!sme_active())
+ return prot;
+
+ if (early_memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size) ||
+ memremap_should_map_decrypted(phys_addr, size))
+ prot = pgprot_decrypted(prot);
+ else
+ prot = pgprot_encrypted(prot);
+
+ return prot;
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+ return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory encrypted and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
+
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory decrypted and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
- iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
+#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
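With the hooks above, RAM is remapped encrypted by default and the known firmware/boot regions come back decrypted automatically; a caller that already knows the answer can short-circuit the checks with the MEMREMAP_ENC or MEMREMAP_DEC hint, as memremap_is_setup_data() itself does. A minimal sketch (phys and size are placeholders):

/* Map a region that firmware or a device writes in the clear. */
static void *map_shared_decrypted(resource_size_t phys, size_t size)
{
	return memremap(phys, size, MEMREMAP_WB | MEMREMAP_DEC);
}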
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..bc84b73684b7 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -11,8 +11,8 @@
#include <asm/e820/types.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/pgtable.h>
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range)
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
void __init kasan_early_init(void)
{
int i;
- pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
*/
memset(kasan_zero_page, 0, PAGE_SIZE);
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+ pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
set_pte(&kasan_zero_pte[i], pte);
}
/* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000000..0fbd09269757
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,593 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[] __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+unsigned long sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL_GPL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ local_flush_tlb();
+ wbinvd();
+
+ /*
+	 * There are a limited number of early mapping slots, so map (at most)
+	 * one page at a time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ __native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+}
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+ if (!sme_me_mask)
+ return;
+
+ /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+ swiotlb_update_mem_attributes();
+
+ pr_info("AMD Secure Memory Encryption (SME) active\n");
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+ WARN(PAGE_ALIGN(size) != size,
+ "size is not page-aligned (%#lx)\n", size);
+
+ /* Make the SWIOTLB buffer area decrypted */
+ set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+ unsigned long end)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = start & PGDIR_MASK;
+ pgd_end = end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+ pgd_size *= sizeof(pgd_t);
+
+ pgd_p = pgd_base + pgd_index(start);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+ unsigned long vaddr, pmdval_t pmd_val)
+{
+ pgd_t *pgd_p;
+ p4d_t *p4d_p;
+ pud_t *pud_p;
+ pmd_t *pmd_p;
+
+ pgd_p = pgd_base + pgd_index(vaddr);
+ if (native_pgd_val(*pgd_p)) {
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+ else
+ pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+ } else {
+ pgd_t pgd;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_p = pgtable_area;
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+ pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+ } else {
+ pud_p = pgtable_area;
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+ pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+ }
+ native_set_pgd(pgd_p, pgd);
+ }
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_p += p4d_index(vaddr);
+ if (native_p4d_val(*p4d_p)) {
+ pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+ } else {
+ p4d_t p4d;
+
+ pud_p = pgtable_area;
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+ p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+ native_set_p4d(p4d_p, p4d);
+ }
+ }
+
+ pud_p += pud_index(vaddr);
+ if (native_pud_val(*pud_p)) {
+ if (native_pud_val(*pud_p) & _PAGE_PSE)
+ goto out;
+
+ pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+ } else {
+ pud_t pud;
+
+ pmd_p = pgtable_area;
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+ pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+ native_set_pud(pud_p, pud);
+ }
+
+ pmd_p += pmd_index(vaddr);
+ if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+ native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+ return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long p4d_size, pud_size, pmd_size;
+ unsigned long total;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+	 * entries that are needed. Those mappings will be covered by 2MB
+ * PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. Incrementing the count for each covers the case where
+ * the addresses cross entries.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+ p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+ pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ } else {
+ p4d_size = 0;
+ pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ }
+ pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+ pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+ total = p4d_size + pud_size + pmd_size;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+ p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+ pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ } else {
+ p4d_size = 0;
+ pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ }
+ pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+ pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+ total += p4d_size + pud_size + pmd_size;
+
+ return total;
+}
+
+void __init sme_encrypt_kernel(void)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long pgtable_area_len;
+ unsigned long paddr, pmd_flags;
+ unsigned long decrypted_base;
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ if (!sme_active())
+ return;
+
+ /*
+ * Prepare for encrypting the kernel by building new pagetables with
+ * the necessary attributes needed to encrypt the kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel as decrypted and write-protected.
+ *
+	 * The use of the write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ /* Physical addresses gives us the identity mapped virtual addresses */
+ kernel_start = __pa_symbol(_text);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ /* Set the encryption workarea to be immediately after the kernel */
+ workarea_start = kernel_end;
+
+ /*
+	 * Calculate the number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_PAGE_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = workarea_start + workarea_len;
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ pgd = (pgd_t *)native_read_cr3_pa();
+ paddr = workarea_start;
+ while (paddr < workarea_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + PMD_FLAGS);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * to be encrypted. It starts with an empty PGD that will then be
+ * populated with new PUDs and PMDs as the encrypted and decrypted
+ * kernel mappings are created.
+ */
+ pgd = pgtable_area;
+ memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+ pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+ /* Add encrypted kernel (identity) mappings */
+ pmd_flags = PMD_FLAGS | _PAGE_ENC;
+ paddr = kernel_start;
+ while (paddr < kernel_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + pmd_flags);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+ paddr = kernel_start;
+ while (paddr < kernel_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr + decrypted_base,
+ paddr + pmd_flags);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ paddr = workarea_start;
+ while (paddr < workarea_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + PMD_FLAGS);
+
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr + decrypted_base,
+ paddr + PMD_FLAGS);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ sme_clear_pgd(pgd, kernel_start + decrypted_base,
+ kernel_end + decrypted_base);
+
+ sme_clear_pgd(pgd, workarea_start + decrypted_base,
+ workarea_end + decrypted_base);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ unsigned int eax, ebx, ecx, edx;
+ bool active_by_default;
+ unsigned long me_mask;
+ char buffer[16];
+ u64 msr;
+
+ /* Check for the SME support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+ /*
+ * Check for the SME feature:
+ * CPUID Fn8000_001F[EAX] - Bit 0
+ * Secure Memory Encryption support
+ * CPUID Fn8000_001F[EBX] - Bits 5:0
+ * Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (!(eax & 1))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check if SME is enabled */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ /*
+ * Fixups have not been applied to phys_base yet and we're running
+ * identity mapped, so we must obtain the address to the SME command
+ * line argument data using rip-relative addressing.
+ */
+ asm ("lea sme_cmdline_arg(%%rip), %0"
+ : "=r" (cmdline_arg)
+ : "p" (sme_cmdline_arg));
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+ asm ("lea sme_cmdline_off(%%rip), %0"
+ : "=r" (cmdline_off)
+ : "p" (sme_cmdline_off));
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+ active_by_default = true;
+ else
+ active_by_default = false;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+ cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+ else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+ sme_me_mask = 0;
+ else
+ sme_me_mask = active_by_default ? me_mask : 0;
+}
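To get a feel for the sizes involved, the sketch below reproduces the sme_pgtable_calc() arithmetic for the 4-level case and evaluates it for an illustrative 32 MB image; sme_encrypt_kernel() then doubles the result for the encrypted and decrypted mappings, adds a PGD, and adds tables for the workarea itself:

#include <stdio.h>

#define PUD_SIZE	(1UL << 30)			/* 1 GB   */
#define PGDIR_SIZE	(1UL << 39)			/* 512 GB */
#define TBL_BYTES	(512UL * 8)			/* one 4 KB page table */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static unsigned long pgtable_calc(unsigned long len)
{
	unsigned long pud, pmd, total;

	/* tables needed to map 'len' bytes with 2 MB PMD entries */
	pud = ((ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1) * TBL_BYTES;
	pmd = ((ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1) * TBL_BYTES;
	total = pud + pmd;

	/* tables needed to map the new tables themselves */
	pud = (ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE) * TBL_BYTES;
	pmd = (ALIGN(total, PUD_SIZE) / PUD_SIZE) * TBL_BYTES;

	return total + pud + pmd;
}

int main(void)
{
	/* 32 MB image: prints 24576, i.e. six 4 KB tables */
	printf("%lu\n", pgtable_calc(32UL << 20));
	return 0;
}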
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000000..730e6d541df1
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+ .text
+ .code64
+ENTRY(sme_encrypt_execute)
+
+ /*
+ * Entry parameters:
+ * RDI - virtual address for the encrypted kernel mapping
+ * RSI - virtual address for the decrypted kernel mapping
+ * RDX - length of kernel
+ * RCX - virtual address of the encryption workarea, including:
+ * - stack page (PAGE_SIZE)
+ * - encryption routine page (PAGE_SIZE)
+ * - intermediate copy buffer (PMD_PAGE_SIZE)
+	 *   R8  - physical address of the pagetables to use for encryption
+ */
+
+ push %rbp
+ movq %rsp, %rbp /* RBP now has original stack pointer */
+
+ /* Set up a one page stack in the non-encrypted memory area */
+ movq %rcx, %rax /* Workarea stack page */
+ leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */
+ addq $PAGE_SIZE, %rax /* Workarea encryption routine */
+
+ push %r12
+ movq %rdi, %r10 /* Encrypted kernel */
+ movq %rsi, %r11 /* Decrypted kernel */
+ movq %rdx, %r12 /* Kernel length */
+
+ /* Copy encryption routine into the workarea */
+ movq %rax, %rdi /* Workarea encryption routine */
+ leaq __enc_copy(%rip), %rsi /* Encryption routine */
+ movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */
+ rep movsb
+
+ /* Setup registers for call */
+ movq %r10, %rdi /* Encrypted kernel */
+ movq %r11, %rsi /* Decrypted kernel */
+ movq %r8, %rdx /* Pagetables used for encryption */
+ movq %r12, %rcx /* Kernel length */
+ movq %rax, %r8 /* Workarea encryption routine */
+ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
+
+ call *%rax /* Call the encryption routine */
+
+ pop %r12
+
+ movq %rbp, %rsp /* Restore original stack pointer */
+ pop %rbp
+
+ ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt kernel.
+ * This routine must be run outside of the kernel proper since
+ * the kernel will be encrypted during the process. So this
+ * routine is defined here and then copied to an area outside
+ * of the kernel where it will remain and run decrypted
+ * during execution.
+ *
+ * On entry the registers must be:
+ * RDI - virtual address for the encrypted kernel mapping
+ * RSI - virtual address for the decrypted kernel mapping
+ * RDX - address of the pagetables to use for encryption
+ * RCX - length of kernel
+ * R8 - intermediate copy buffer
+ *
+ * RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+ /* Enable the new page tables */
+ mov %rdx, %cr3
+
+ /* Flush any global TLBs */
+ mov %cr4, %rdx
+ andq $~X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+ orq $X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+
+ /* Set the PAT register PA5 entry to write-protect */
+ push %rcx
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ push %rdx /* Save original PAT value */
+ andl $0xffff00ff, %edx /* Clear PA5 */
+ orl $0x00000500, %edx /* Set PA5 to WP */
+ wrmsr
+ pop %rdx /* RDX contains original PAT value */
+ pop %rcx
+
+ movq %rcx, %r9 /* Save kernel length */
+ movq %rdi, %r10 /* Save encrypted kernel address */
+ movq %rsi, %r11 /* Save decrypted kernel address */
+
+ wbinvd /* Invalidate any cache entries */
+
+ /* Copy/encrypt 2MB at a time */
+1:
+ movq %r11, %rsi /* Source - decrypted kernel */
+ movq %r8, %rdi /* Dest - intermediate copy buffer */
+ movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
+ rep movsb
+
+ movq %r8, %rsi /* Source - intermediate copy buffer */
+ movq %r10, %rdi /* Dest - encrypted kernel */
+ movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
+ rep movsb
+
+ addq $PMD_PAGE_SIZE, %r11
+ addq $PMD_PAGE_SIZE, %r10
+ subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
+ jnz 1b /* Kernel length not zero? */
+
+ /* Restore PAT register */
+ push %rdx /* Save original PAT value */
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ pop %rdx /* Restore original PAT value */
+ wrmsr
+
+ ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
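A rough C rendering of the 2 MB copy loop in __enc_copy(), to make the data flow explicit. It is illustrative only; the real routine also loads the freshly built page tables into CR3, flushes global TLB entries, reprograms PAT entry PA5 to write-protect and issues WBINVD, as the assembly above shows:

#include <string.h>

#define PMD_PAGE_SIZE	(2UL << 20)	/* 2 MB, matching the kernel definition */

/* enc and dec are two virtual aliases of the same physical kernel image;
 * buf is the intermediate copy buffer in the workarea.  len is assumed to
 * be a multiple of 2 MB, as guaranteed by the caller. */
static void enc_copy_sketch(char *enc, const char *dec, unsigned long len, char *buf)
{
	while (len) {
		memcpy(buf, dec, PMD_PAGE_SIZE);	/* decrypted alias -> copy buffer */
		memcpy(enc, buf, PMD_PAGE_SIZE);	/* copy buffer -> encrypted alias */

		dec += PMD_PAGE_SIZE;
		enc += PMD_PAGE_SIZE;
		len -= PMD_PAGE_SIZE;
	}
}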
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 229d04a83f85..a99679826846 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -37,22 +37,21 @@ struct va_alignment __read_mostly va_align = {
.flags = -1,
};
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
{
return IA32_PAGE_OFFSET;
}
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
{
- return TASK_SIZE_MAX;
+ return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
}
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
- if ((current->flags & PF_RANDOMIZE) &&
- !(current->personality & ADDR_NO_RANDOMIZE)) {
- max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+ if (current->flags & PF_RANDOMIZE) {
+ max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
max <<= PAGE_SHIFT;
}
@@ -79,13 +78,13 @@ static int mmap_is_legacy(void)
static unsigned long arch_rnd(unsigned int rndbits)
{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
}
unsigned long arch_mmap_rnd(void)
{
- if (!(current->flags & PF_RANDOMIZE))
- return 0;
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
@@ -142,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
- arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+ arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/*
@@ -152,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* mmap_base, the compat syscall uses mmap_compat_base.
*/
arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
- arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+ arch_rnd(mmap32_rnd_bits), task_size_32bit());
#endif
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 1c34b767c84c..9ceaa955d2ba 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
*/
bd_base = mpx_get_bounds_dir();
down_write(&mm->mmap_sem);
+
+ /* MPX doesn't support addresses above 47 bits yet. */
+ if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+ pr_warn_once("%s (%d): MPX cannot handle addresses "
+ "above 47-bits. Disabling.",
+ current->comm, current->pid);
+ ret = -ENXIO;
+ goto out;
+ }
mm->context.bd_addr = bd_base;
if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
ret = -ENXIO;
-
+out:
up_write(&mm->mmap_sem);
return ret;
}
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
if (ret)
force_sig(SIGSEGV, current);
}
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+ unsigned long flags)
+{
+ if (!kernel_managing_mpx_tables(current->mm))
+ return addr;
+ if (addr + len <= DEFAULT_MAP_WINDOW)
+ return addr;
+ if (flags & MAP_FIXED)
+ return -ENOMEM;
+
+ /*
+ * Requested len is larger than the whole area we're allowed to map in.
+	 * Resetting the hint address wouldn't do much good -- fail early.
+ */
+ if (len > DEFAULT_MAP_WINDOW)
+ return -ENOMEM;
+
+	/* Look for an unmapped area within DEFAULT_MAP_WINDOW */
+ return 0;
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce3dedf..d805162e6045 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr. The return value is the number of nodes allocated.
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, int nr_nodes)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 size;
int big;
int nid = 0;
@@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
return -1;
}
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
@@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'. The return value is the number of nodes allocated.
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, u64 size)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 min_size;
int nid = 0;
int i, ret;
@@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
}
size &= FAKE_NODE_MIN_HASH_MASK;
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
@@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return 0;
}
+int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+ int i, max_emu_nid = 0;
+
+ *dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (*dfl_phys_nid == NUMA_NO_NODE)
+ *dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+
+ return max_emu_nid;
+}
+
/**
* numa_emulation - Emulate NUMA nodes
* @numa_meminfo: NUMA configuration to massage
@@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* Determine the max emulated nid and the default phys nid to use
* for unmapped nodes.
*/
- max_emu_nid = 0;
- dfl_phys_nid = NUMA_NO_NODE;
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
- if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
- max_emu_nid = i;
- if (dfl_phys_nid == NUMA_NO_NODE)
- dfl_phys_nid = emu_nid_to_phys[i];
- }
- }
- if (dfl_phys_nid == NUMA_NO_NODE) {
- pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
- goto no_emu;
- }
+ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
/* commit */
*numa_meminfo = ei;
+ /* Make sure numa_nodes_parsed only contains emulated nodes */
+ nodes_clear(numa_nodes_parsed);
+ for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+ if (ei.blk[i].start != ei.blk[i].end &&
+ ei.blk[i].nid != NUMA_NO_NODE)
+ node_set(ei.blk[i].nid, numa_nodes_parsed);
+
/*
* Transform __apicid_to_node table to use emulated nids by
* reverse-mapping phys_nid. The maps should always exist but fall
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 757b0bcdf712..dfb7d657cf43 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
__pgprot(0), 1, 0, NULL);
}
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+ struct cpa_data cpa;
+ unsigned long start;
+ int ret;
+
+ /* Nothing to do if the SME is not active */
+ if (!sme_active())
+ return 0;
+
+ /* Should not be working on unaligned addresses */
+ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+ addr &= PAGE_MASK;
+
+ start = addr;
+
+ memset(&cpa, 0, sizeof(cpa));
+ cpa.vaddr = &addr;
+ cpa.numpages = numpages;
+ cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+ cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+ cpa.pgd = init_mm.pgd;
+
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+ vm_unmap_aliases();
+
+ /*
+ * Before changing the encryption attribute, we need to flush caches.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 1);
+ else
+ cpa_flush_all(1);
+
+ ret = __change_page_attr_set_clr(&cpa, 1);
+
+ /*
+ * After changing the encryption attribute, we need to flush TLBs
+ * again in case any speculative TLB caching occurred (but no need
+ * to flush caches again). We could just use cpa_flush_all(), but
+ * in case TLB flushing gets optimized in the cpa_flush_range()
+ * path use the same logic as above.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 0);
+ else
+ cpa_flush_all(0);
+
+ return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
int set_pages_uc(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
if (!(page_flags & _PAGE_RW))
cpa.mask_clr = __pgprot(_PAGE_RW);
+ if (!(page_flags & _PAGE_ENC))
+ cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
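The set_memory_encrypted()/set_memory_decrypted() pair added above is the interface the rest of the series uses to change the SME encryption attribute on a range of kernel memory. A minimal sketch of a hypothetical caller follows; the helper names and the GFP/order choices are illustrative and not part of this patch.

#include <linux/gfp.h>
#include <asm/set_memory.h>

/* Allocate pages that a device or hypervisor must be able to read in the clear. */
static void *alloc_shared_buffer(unsigned int order)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);

	if (!addr)
		return NULL;

	/* Clear the encryption attribute; this is a no-op when SME is inactive. */
	if (set_memory_decrypted(addr, 1 << order)) {
		free_pages(addr, order);
		return NULL;
	}

	return (void *)addr;
}

static void free_shared_buffer(void *buf, unsigned int order)
{
	/* Restore the encryption attribute before the pages go back to the allocator. */
	set_memory_encrypted((unsigned long)buf, 1 << order);
	free_pages((unsigned long)buf, order);
}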
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 45979502f64b..fe7d57a8fb60 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -293,7 +293,7 @@ void init_cache_modes(void)
* pat_init - Initialize PAT MSR and PAT table
*
* This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
*
* This function must be called on all CPUs using the specific sequence of
* operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
* 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
* 011 3 UC : _PAGE_CACHE_MODE_UC
* 100 4 WB : Reserved
- * 101 5 WC : Reserved
+ * 101 5 WP : _PAGE_CACHE_MODE_WP
* 110 6 UC-: Reserved
* 111 7 WT : _PAGE_CACHE_MODE_WT
*
@@ -360,7 +360,7 @@ void pat_init(void)
* corresponding types in the presence of PAT errata.
*/
pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+ PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
}
if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
+ if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+ vma_prot = pgprot_decrypted(vma_prot);
+
return vma_prot;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..218834a3e9ad 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
+ tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_page(tlb, page);
+ tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..ce104b962a17 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,42 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+ u16 *new_asid, bool *need_flush)
+{
+ u16 asid;
+
+ if (!static_cpu_has(X86_FEATURE_PCID)) {
+ *new_asid = 0;
+ *need_flush = true;
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+ next->context.ctx_id)
+ continue;
+
+ *new_asid = asid;
+ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+ next_tlb_gen);
+ return;
+ }
+
+ /*
+ * We don't currently own an ASID slot on this CPU.
+ * Allocate a slot.
+ */
+ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+ if (*new_asid >= TLB_NR_DYN_ASIDS) {
+ *new_asid = 0;
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ }
+ *need_flush = true;
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,12 +79,11 @@ void leave_mm(int cpu)
if (loaded_mm == &init_mm)
return;
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- BUG();
+ /* Warn if we're not lazy. */
+ WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
switch_mm(NULL, &init_mm, NULL);
}
-EXPORT_SYMBOL_GPL(leave_mm);
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
@@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- unsigned cpu = smp_processor_id();
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned cpu = smp_processor_id();
+ u64 next_tlb_gen;
/*
- * NB: The scheduler will call us with prev == next when
- * switching from lazy TLB mode to normal mode if active_mm
- * isn't changing. When this happens, there is no guarantee
- * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ * NB: The scheduler will call us with prev == next when switching
+ * from lazy TLB mode to normal mode if active_mm isn't changing.
+ * When this happens, we don't assume that CR3 (and hence
+ * cpu_tlbstate.loaded_mm) matches next.
*
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ /* We don't want flush_tlb_func_* to run concurrently with us. */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * Verify that CR3 is what we think it is. This will catch
+ * hypothetical buggy code that directly switches to swapper_pg_dir
+ * without going through leave_mm() / switch_mm_irqs_off() or that
+ * does something like write_cr3(read_cr3_pa()).
+ */
+ VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
if (real_prev == next) {
- /*
- * There's nothing to do: we always keep the per-mm control
- * regs in sync with cpu_tlbstate.loaded_mm. Just
- * sanity-check mm_cpumask.
- */
- if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
- cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
- }
+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
+
+ if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * There's nothing to do: we weren't lazy, and we
+ * aren't changing our mm. We don't need to flush
+ * anything, nor do we need to update CR3, CR4, or
+ * LDTR.
+ */
+ return;
+ }
+
+ /* Resume remote flushes and then read tlb_gen. */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+ next_tlb_gen) {
+ /*
+ * Ideally, we'd have a flush_tlb() variant that
+ * takes the known CR3 value as input. This would
+ * be faster on Xen PV and on hypothetical CPUs
+ * on which INVPCID is fast.
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
+ next_tlb_gen);
+ write_cr3(__sme_pa(next->pgd) | prev_asid);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+ TLB_FLUSH_ALL);
+ }
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
+ * We just exited lazy mode, which means that CR4 and/or LDTR
+ * may be stale. (Changes to the required CR4 and LDTR states
+ * are not reflected in tlb_gen.)
*/
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
- pgd_t *pgd = next->pgd + stack_pgd_index;
-
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
+ } else {
+ u16 new_asid;
+ bool need_flush;
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int index = pgd_index(current_stack_pointer());
+ pgd_t *pgd = next->pgd + index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[index]);
+ }
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ /* Stop remote flushes for the previous mm */
+ if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
- WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- */
- load_cr3(next->pgd);
-
- /*
- * This gets called via leave_mm() in the idle path where RCU
- * functions differently. Tracing normally uses RCU, so we have to
- * call the tracepoint specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ /*
+ * Start remote flushes and then read tlb_gen.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ write_cr3(__sme_pa(next->pgd) | new_asid);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+ TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
- /* Stop flush ipis for the previous mm */
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ }
- /* Load per-mm CR4 and LDTR state */
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
+ /*
+ * We have three different tlb_gen values in here. They are:
+ *
+ * - mm_tlb_gen: the latest generation.
+ * - local_tlb_gen: the generation that this CPU has already caught
+ * up to.
+ * - f->new_tlb_gen: the generation that the requester of the flush
+ * wants us to catch up to.
+ */
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- leave_mm(smp_processor_id());
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+ loaded_mm->context.ctx_id);
+
+ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+ /*
+ * We're in lazy mode -- don't flush. We can get here on
+ * remote flushes due to races and on local flushes if a
+ * kernel thread coincidentally flushes the mm it's lazily
+ * still using.
+ */
return;
}
- if (f->end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- if (local)
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
- } else {
+ if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+ /*
+ * There's nothing to do: we're already up to date. This can
+ * happen if two concurrent flushes happen -- the first flush to
+ * be handled can catch us all the way up, leaving no work for
+ * the second flush.
+ */
+ trace_tlb_flush(reason, 0);
+ return;
+ }
+
+ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+ /*
+ * If we get to this point, we know that our TLB is out of date.
+ * This does not strictly imply that we need to flush (it's
+ * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+ * going to need to flush in the very near future, so we might
+ * as well get it over with.
+ *
+ * The only question is whether to do a full or partial flush.
+ *
+ * We do a partial flush if requested and two extra conditions
+ * are met:
+ *
+ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+ * we've always done all needed flushes to catch up to
+ * local_tlb_gen. If, for example, local_tlb_gen == 2 and
+ * f->new_tlb_gen == 3, then we know that the flush needed to bring
+ * us up to date for tlb_gen 3 is the partial flush we're
+ * processing.
+ *
+ * As an example of why this check is needed, suppose that there
+ * are two concurrent flushes. The first is a full flush that
+ * changes context.tlb_gen from 1 to 2. The second is a partial
+ * flush that changes context.tlb_gen from 2 to 3. If they get
+ * processed on this CPU in reverse order, we'll see
+ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+ * If we were to use __flush_tlb_single() and set local_tlb_gen to
+ * 3, we'd be break the invariant: we'd update local_tlb_gen above
+ * 1 without the full flush that's needed for tlb_gen 2.
+ *
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
+ * Partial TLB flushes are not all that much cheaper than full TLB
+ * flushes, so it seems unlikely that it would be a performance win
+ * to do a partial flush if that won't bring our TLB fully up to
+ * date. By doing a full flush instead, we can increase
+ * local_tlb_gen all the way to mm_tlb_gen and we can probably
+ * avoid another flush in the very near future.
+ */
+ if (f->end != TLB_FLUSH_ALL &&
+ f->new_tlb_gen == local_tlb_gen + 1 &&
+ f->new_tlb_gen == mm_tlb_gen) {
+ /* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
addr = f->start;
while (addr < f->end) {
__flush_tlb_single(addr);
@@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
+ } else {
+ /* Full flush. */
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
+
+ /* Both paths above update our state to mm_tlb_gen. */
+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
+ /*
+ * This whole special case is confused. UV has a "Broadcast
+ * Assist Unit", which seems to be a fancy way to send IPIs.
+ * Back when x86 used an explicit TLB flush IPI, UV was
+ * optimized to use its own mechanism. These days, x86 uses
+ * smp_call_function_many(), but UV still uses a manual IPI,
+ * and that IPI's action is out of date -- it does a manual
+ * flush instead of calling flush_tlb_func_remote(). This
+ * means that the percpu tlb_gen variables won't be updated
+ * and we'll do pointless flushes on future context switches.
+ *
+ * Rather than hooking native_flush_tlb_others() here, I think
+ * that UV should be updated so that smp_call_function_many(),
+ * etc, are optimal on UV.
+ */
unsigned int cpu;
cpu = smp_processor_id();
@@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
cpu = get_cpu();
- /* Synchronize with switch_mm. */
- smp_mb();
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info);
+
put_cpu();
}
@@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
@@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info);
+
cpumask_clear(&batch->cpumask);
put_cpu();
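The context-switch and flush code above indexes a per-CPU cpu_tlbstate structure (loaded_mm, loaded_mm_asid, next_asid, ctxs[asid].ctx_id, ctxs[asid].tlb_gen) whose definition lives in asm/tlbflush.h and is not part of this hunk. Reconstructed from those accesses, the state is roughly shaped as follows; exact field order, types and the declaration macro are assumptions.

struct tlb_context {
	u64 ctx_id;	/* identifies the mm this ASID slot caches */
	u64 tlb_gen;	/* flush generation this slot has caught up to */
};

struct tlb_state {
	struct mm_struct *loaded_mm;	/* mm currently loaded in CR3 */
	u16 loaded_mm_asid;		/* ASID currently loaded in CR3 */
	u16 next_asid;			/* round-robin cursor used by choose_new_asid() */
	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
	/* ... */
};

DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);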
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132b0ed4..7a5350d08cef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = ioremap(pa_data, sizeof(*rom));
+ data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
if (!data)
return -ENOMEM;
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
}
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
}
set_dma_domain_ops(dev);
set_dev_domain_options(dev);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f084d8718ac4..6217b23e85f6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
/*
* Convenience functions to obtain memory types and attributes
*/
-u32 efi_mem_type(unsigned long phys_addr)
+int efi_mem_type(unsigned long phys_addr)
{
efi_memory_desc_t *md;
if (!efi_enabled(EFI_MEMMAP))
- return 0;
+ return -ENOTSUPP;
for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
(md->num_pages << EFI_PAGE_SHIFT))))
return md->type;
}
- return 0;
+ return -EINVAL;
}
static int __init arch_parse_efi_cmdline(char *str)
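With efi_mem_type() now returning an int, callers can no longer treat 0 as "no information": they get an EFI_* memory type on success and a negative errno otherwise. A hypothetical caller (name and use illustrative only) would be adjusted along these lines:

#include <linux/efi.h>

static bool phys_addr_is_conventional_ram(unsigned long phys_addr)
{
	int type = efi_mem_type(phys_addr);

	/* -ENOTSUPP: no EFI memory map; -EINVAL: no descriptor covers the address. */
	if (type < 0)
		return false;

	return type == EFI_CONVENTIONAL_MEMORY;
}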
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9bf72f5bfedb..12e83888e5b9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
- unsigned long pfn, text;
+ unsigned long pfn, text, pf;
struct page *page;
unsigned npages;
pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
if (efi_enabled(EFI_OLD_MEMMAP))
return 0;
- efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+ /*
+ * Since the PGD is encrypted, set the encryption mask so that when
+ * this value is loaded into cr3 the PGD will be decrypted during
+ * the pagetable walk.
+ */
+ efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
pgd = efi_pgd;
/*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* phys_efi_set_virtual_address_map().
*/
pfn = pa_memmap >> PAGE_SHIFT;
- if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
+ pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+ if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
return 1;
}
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
text = __pa(_text);
pfn = text >> PAGE_SHIFT;
- if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
+ pf = _PAGE_RW | _PAGE_ENC;
+ if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
pr_err("Failed to map kernel text 1:1\n");
return 1;
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 3e4bdb442fbc..f44c0bc95aa2 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -26,7 +26,7 @@
static struct bau_operations ops __ro_after_init;
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
-static int timeout_base_ns[] = {
+static const int timeout_base_ns[] = {
20,
160,
1280,
@@ -1216,7 +1216,7 @@ static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
* set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
* Such a message must be ignored.
*/
-void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
{
unsigned long mmr_image;
unsigned char swack_vec;
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cd4be19c36dc..1f71980fc5e0 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,6 +1,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
#include <asm/set_memory.h>
#include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
base = (unsigned char *)real_mode_header;
+ /*
+ * If SME is active, the trampoline area will need to be in
+ * decrypted memory in order to bring up other processors
+ * successfully.
+ */
+ set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
memcpy(base, real_mode_blob, size);
phys_base = __pa(base);
@@ -100,6 +108,10 @@ static void __init setup_real_mode(void)
trampoline_cr4_features = &trampoline_header->cr4;
*trampoline_cr4_features = mmu_cr4_features;
+ trampoline_header->flags = 0;
+ if (sme_active())
+ trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index dac7b20d2f9d..614fd7064d0a 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -30,6 +30,7 @@
#include <asm/msr.h>
#include <asm/segment.h>
#include <asm/processor-flags.h>
+#include <asm/realmode.h>
#include "realmode.h"
.text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
movl %edx, %fs
movl %edx, %gs
+ /*
+ * Check for memory encryption support. This is a safety net in
+ * case BIOS hasn't done the necessary step of setting the bit in
+ * the MSR for this AP. If SME is active and we've gotten this far
+ * then it is safe for us to set the MSR bit and continue. If we
+ * don't we'll eventually crash trying to execute encrypted
+ * instructions.
+ */
+ bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+ jnc .Ldone
+ movl $MSR_K8_SYSCFG, %ecx
+ rdmsr
+ bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+ jc .Ldone
+
+ /*
+ * Memory encryption is enabled but the SME enable bit for this
+ * CPU has not been set. It is safe to set it, so do so.
+ */
+ wrmsr
+.Ldone:
+
movl pa_tr_cr4, %eax
movl %eax, %cr4 # Enable PAE mode
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
tr_start: .space 8
GLOBAL(tr_efer) .space 8
GLOBAL(tr_cr4) .space 4
+ GLOBAL(tr_flags) .space 4
END(trampoline_header)
#include "trampoline_common.S"
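For readers who do not want to trace the 32-bit assembly above: when the trampoline flags say SME is active, the AP reads MSR_K8_SYSCFG and sets the memory-encryption enable bit if the BIOS left it clear. A rough C equivalent is sketched below; the helper name is hypothetical and the mask form assumes the MSR_K8_SYSCFG_MEM_ENCRYPT definition introduced elsewhere in this series.

#include <asm/msr.h>
#include <asm/msr-index.h>

static void sme_enable_this_ap(void)
{
	u64 syscfg;

	rdmsrl(MSR_K8_SYSCFG, syscfg);
	if (syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT)
		return;		/* BIOS already did its job */

	/* Safe to enable: the BSP is already running encrypted. */
	syscfg |= MSR_K8_SYSCFG_MEM_ENCRYPT;
	wrmsrl(MSR_K8_SYSCFG, syscfg);
}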
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ae4cd58c0c7a..02250b2633b8 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -50,7 +50,7 @@ void foo(void)
DEFINE(HOST_GS, GS);
DEFINE(HOST_ORIG_AX, ORIG_EAX);
#else
-#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET)
+#ifdef FP_XSTATE_MAGIC1
DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
#else
DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 027987638e98..1ecd419811a2 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -17,6 +17,9 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
+ # XEN_PV is not ready to work with 5-level paging.
+ # Changes to hypervisor are also required.
+ depends on !X86_5LEVEL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
config XEN_PVH
bool "Support for running as a PVH guest"
depends on XEN && XEN_PVHVM && ACPI
+ # Pre-built page tables are not ready to handle 5-level paging.
+ depends on !X86_5LEVEL
def_bool n
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 87d791356ea9..de503c225ae1 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -12,6 +12,7 @@
#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
+#include <asm/early_ioremap.h>
#include <asm/xen/cpuid.h>
#include <asm/xen/hypervisor.h>
@@ -21,38 +22,50 @@
#include "mmu.h"
#include "smp.h"
-void __ref xen_hvm_init_shared_info(void)
+static unsigned long shared_info_pfn;
+
+void xen_hvm_init_shared_info(void)
{
struct xen_add_to_physmap xatp;
- u64 pa;
-
- if (HYPERVISOR_shared_info == &xen_dummy_shared_info) {
- /*
- * Search for a free page starting at 4kB physical address.
- * Low memory is preferred to avoid an EPT large page split up
- * by the mapping.
- * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
- * the BIOS used for HVM guests is well behaved and won't
- * clobber memory other than the first 4kB.
- */
- for (pa = PAGE_SIZE;
- !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
- memblock_is_reserved(pa);
- pa += PAGE_SIZE)
- ;
-
- memblock_reserve(pa, PAGE_SIZE);
- HYPERVISOR_shared_info = __va(pa);
- }
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info);
+ xatp.gpfn = shared_info_pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
}
+static void __init reserve_shared_info(void)
+{
+ u64 pa;
+
+ /*
+ * Search for a free page starting at 4kB physical address.
+ * Low memory is preferred to avoid an EPT large page split up
+ * by the mapping.
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+ * the BIOS used for HVM guests is well behaved and won't
+ * clobber memory other than the first 4kB.
+ */
+ for (pa = PAGE_SIZE;
+ !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+ memblock_is_reserved(pa);
+ pa += PAGE_SIZE)
+ ;
+
+ shared_info_pfn = PHYS_PFN(pa);
+
+ memblock_reserve(pa, PAGE_SIZE);
+ HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
+}
+
+static void __init xen_hvm_init_mem_mapping(void)
+{
+ early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
+ HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+}
+
static void __init init_hvm_pv_info(void)
{
int major, minor;
@@ -153,6 +166,7 @@ static void __init xen_hvm_guest_init(void)
init_hvm_pv_info();
+ reserve_shared_info();
xen_hvm_init_shared_info();
/*
@@ -218,5 +232,6 @@ const struct hypervisor_x86 x86_hyper_xen_hvm = {
.init_platform = xen_hvm_guest_init,
.pin_vcpu = xen_pin_vcpu,
.x2apic_available = xen_x2apic_para_available,
+ .init_mem_mapping = xen_hvm_init_mem_mapping,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..6c279c8f0a0e 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_MTRR);
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
if (!xen_initial_domain())
setup_clear_cpu_cap(X86_FEATURE_ACPI);
@@ -981,59 +988,6 @@ void __ref xen_setup_vcpu_info_placement(void)
}
}
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
-{
- char *start, *end, *reloc;
- unsigned ret;
-
- start = end = reloc = NULL;
-
-#define SITE(op, x) \
- case PARAVIRT_PATCH(op.x): \
- if (xen_have_vcpu_info_placement) { \
- start = (char *)xen_##x##_direct; \
- end = xen_##x##_direct_end; \
- reloc = xen_##x##_direct_reloc; \
- } \
- goto patch_site
-
- switch (type) {
- SITE(pv_irq_ops, irq_enable);
- SITE(pv_irq_ops, irq_disable);
- SITE(pv_irq_ops, save_fl);
- SITE(pv_irq_ops, restore_fl);
-#undef SITE
-
- patch_site:
- if (start == NULL || (end-start) > len)
- goto default_patch;
-
- ret = paravirt_patch_insns(insnbuf, len, start, end);
-
- /* Note: because reloc is assigned from something that
- appears to be an array, gcc assumes it's non-null,
- but doesn't know its relationship with start and
- end. */
- if (reloc > start && reloc < end) {
- int reloc_off = reloc - start;
- long *relocp = (long *)(insnbuf + reloc_off);
- long delta = start - (char *)addr;
-
- *relocp += delta;
- }
- break;
-
- default_patch:
- default:
- ret = paravirt_patch_default(type, clobbers, insnbuf,
- addr, len);
- break;
- }
-
- return ret;
-}
-
static const struct pv_info xen_info __initconst = {
.shared_kernel_pmd = 0,
@@ -1043,10 +997,6 @@ static const struct pv_info xen_info __initconst = {
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initconst = {
- .patch = xen_patch,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
@@ -1244,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_init_ops = xen_init_ops;
+ pv_init_ops.patch = paravirt_patch_default;
pv_cpu_ops = xen_cpu_ops;
x86_platform.get_nmi_reason = xen_get_nmi_reason;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cab28cf2cffb..e437714750f8 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
- && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
}
return;
}
- cpumask_copy(mask, mm_cpumask(mm));
/*
* It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
* look at its actual current cr3 value, and force it to flush
* if needed.
*/
+ cpumask_clear(mask);
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index eff224df813f..dcd31fa39b5d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/asm-offsets.h>
@@ -16,7 +10,7 @@
#include <asm/processor-flags.h>
#include <asm/frame.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
/*
* Enable events. This clears the event mask and tests the pending
@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_irq_enable_direct)
FRAME_END
ret
ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
/*
@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
+ENDPROC(xen_irq_disable_direct)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
-ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
/*
@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_restore_fl_direct)
FRAME_END
ret
ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
/*
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
deleted file mode 100644
index 465276467a47..000000000000
--- a/arch/x86/xen/xen-asm.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _XEN_XEN_ASM_H
-#define _XEN_XEN_ASM_H
-
-#include <linux/linkage.h>
-
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index feb6d40a0860..1200e262a116 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/thread_info.h>
@@ -18,21 +12,10 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
/*
* This is run where a normal iret would be run, with the same stack setup:
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..3a3b6a211584 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/errno.h>
@@ -20,7 +14,7 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
@@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
*/
ENTRY(xen_iret)
pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_iret)
-RELOC(xen_iret, 1b+1)
+ jmp hypercall_iret
ENTRY(xen_sysret64)
/*
@@ -65,9 +57,7 @@ ENTRY(xen_sysret64)
pushq %rcx
pushq $VGCF_in_syscall
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret64)
-RELOC(xen_sysret64, 1b+1)
+ jmp hypercall_iret
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
@@ -82,34 +72,47 @@ RELOC(xen_sysret64, 1b+1)
* rip
* r11
* rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
-.macro undo_xen_syscall
- mov 0*8(%rsp), %rcx
- mov 1*8(%rsp), %r11
- mov 5*8(%rsp), %rsp
-.endm
-
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
- undo_xen_syscall
- jmp entry_SYSCALL_64_after_swapgs
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
- undo_xen_syscall
- jmp entry_SYSCALL_compat
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
- undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 72a8e6adebe6..a7525e95d53f 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
- ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
#endif
#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..70301ac0d414 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void)
}
#endif
-/* Declare an asm function, along with symbols needed to make it
- inlineable */
-#define DECL_ASM(ret, name, ...) \
- __visible ret name(__VA_ARGS__); \
- extern char name##_end[] __visible; \
- extern char name##_reloc[] __visible
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 2d716ebc5a5e..dff7cc39437c 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -1,5 +1,6 @@
generic-y += bug.h
generic-y += clkdev.h
+generic-y += device.h
generic-y += div64.h
generic-y += dma-contiguous.h
generic-y += emergency-restart.h
@@ -17,6 +18,7 @@ generic-y += local.h
generic-y += local64.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
+generic-y += param.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += rwsem.h
diff --git a/arch/xtensa/include/asm/device.h b/arch/xtensa/include/asm/device.h
deleted file mode 100644
index 1deeb8ebbb1b..000000000000
--- a/arch/xtensa/include/asm/device.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Arch specific extensions to struct device
- *
- * This file is released under the GPLv2
- */
-#ifndef _ASM_XTENSA_DEVICE_H
-#define _ASM_XTENSA_DEVICE_H
-
-struct dev_archdata {
-};
-
-struct pdev_archdata {
-};
-
-#endif /* _ASM_XTENSA_DEVICE_H */
diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h
index b39531babec0..eaaf1ebcc7a4 100644
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -44,18 +44,10 @@
: "r" (uaddr), "I" (-EFAULT), "r" (oparg) \
: "memory")
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
- return -EFAULT;
#if !XCHAL_HAVE_S32C1I
return -ENOSYS;
@@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
pagefault_enable();
- if (ret)
- return ret;
+ if (!ret)
+ *oval = oldval;
- switch (cmp) {
- case FUTEX_OP_CMP_EQ: return (oldval == cmparg);
- case FUTEX_OP_CMP_NE: return (oldval != cmparg);
- case FUTEX_OP_CMP_LT: return (oldval < cmparg);
- case FUTEX_OP_CMP_GE: return (oldval >= cmparg);
- case FUTEX_OP_CMP_LE: return (oldval <= cmparg);
- case FUTEX_OP_CMP_GT: return (oldval > cmparg);
- }
-
- return -ENOSYS;
+ return ret;
}
static inline int
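The decoding and comparison deleted here did not go away; they moved into a single generic futex_atomic_op_inuser() helper that delegates the atomic part to the new arch_futex_atomic_op_inuser() hook. The sketch below is reconstructed from the lines removed above (access checking omitted); the exact generic implementation lives outside this patch.

/* The FUTEX_OP_* constants come from <linux/futex.h>. */
static int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval < cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval > cmparg;
	}

	return -ENOSYS;
}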
diff --git a/arch/xtensa/include/asm/param.h b/arch/xtensa/include/asm/param.h
deleted file mode 100644
index 0a70e780ef2a..000000000000
--- a/arch/xtensa/include/asm/param.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * include/asm-xtensa/param.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-#ifndef _XTENSA_PARAM_H
-#define _XTENSA_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# define HZ CONFIG_HZ /* internal timer frequency */
-# define USER_HZ 100 /* for user interfaces in "ticks" */
-# define CLOCKS_PER_SEC (USER_HZ) /* frequnzy at which times() counts */
-#endif /* _XTENSA_PARAM_H */
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index a36221cf6363..3bb49681ee24 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -33,11 +33,6 @@
#define arch_spin_is_locked(x) ((x)->slock != 0)
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 33bfa5270d95..08175df7a69e 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start)
* Initialize system. Setup memory and reserve regions.
*/
-extern char _end;
-extern char _stext;
+extern char _end[];
+extern char _stext[];
extern char _WindowVectors_text_start;
extern char _WindowVectors_text_end;
extern char _DebugInterruptVector_literal_start;
@@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p)
}
#endif
- mem_reserve(__pa(&_stext), __pa(&_end));
+ mem_reserve(__pa(_stext), __pa(_end));
#ifdef CONFIG_VECTORS_OFFSET
mem_reserve(__pa(&_WindowVectors_text_start),
diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c
index d159e9b9c018..672391003e40 100644
--- a/arch/xtensa/kernel/xtensa_ksyms.c
+++ b/arch/xtensa/kernel/xtensa_ksyms.c
@@ -94,13 +94,11 @@ unsigned long __sync_fetch_and_or_4(unsigned long *p, unsigned long v)
}
EXPORT_SYMBOL(__sync_fetch_and_or_4);
-#ifdef CONFIG_NET
/*
* Networking support
*/
EXPORT_SYMBOL(csum_partial);
EXPORT_SYMBOL(csum_partial_copy_generic);
-#endif /* CONFIG_NET */
/*
* Architecture-specific symbols
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 1a804a2f9a5b..3c75c4e597da 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -103,6 +103,7 @@ void clear_user_highpage(struct page *page, unsigned long vaddr)
clear_page_alias(kvaddr, paddr);
preempt_enable();
}
+EXPORT_SYMBOL(clear_user_highpage);
void copy_user_highpage(struct page *dst, struct page *src,
unsigned long vaddr, struct vm_area_struct *vma)
@@ -119,10 +120,7 @@ void copy_user_highpage(struct page *dst, struct page *src,
copy_page_alias(dst_vaddr, src_vaddr, dst_paddr, src_paddr);
preempt_enable();
}
-
-#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */
-
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+EXPORT_SYMBOL(copy_user_highpage);
/*
* Any time the kernel writes to a user page cache page, or it is about to
@@ -176,7 +174,7 @@ void flush_dcache_page(struct page *page)
/* There shouldn't be an entry in the cache for this page anymore. */
}
-
+EXPORT_SYMBOL(flush_dcache_page);
/*
* For now, flush the whole cache. FIXME??
@@ -188,6 +186,7 @@ void local_flush_cache_range(struct vm_area_struct *vma,
__flush_invalidate_dcache_all();
__invalidate_icache_all();
}
+EXPORT_SYMBOL(local_flush_cache_range);
/*
* Remove any entry in the cache for this page.
@@ -207,8 +206,9 @@ void local_flush_cache_page(struct vm_area_struct *vma, unsigned long address,
__flush_invalidate_dcache_page_alias(virt, phys);
__invalidate_icache_page_alias(virt, phys);
}
+EXPORT_SYMBOL(local_flush_cache_page);
-#endif
+#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */
void
update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
@@ -225,7 +225,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
flush_tlb_page(vma, addr);
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+#if (DCACHE_WAY_SIZE > PAGE_SIZE)
if (!PageReserved(page) && test_bit(PG_arch_1, &page->flags)) {
unsigned long phys = page_to_phys(page);
@@ -256,7 +256,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
* flush_dcache_page() on the page.
*/
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+#if (DCACHE_WAY_SIZE > PAGE_SIZE)
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long vaddr, void *dst, const void *src,