186 files changed, 12271 insertions, 3629 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81f2c56..cb6e3a219294 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Select 32 or 64 bit
 config 64BIT
-	bool "64-bit kernel" if ARCH = "x86"
-	default ARCH != "i386"
+	bool "64-bit kernel" if "$(ARCH)" = "x86"
+	default "$(ARCH)" != "i386"
 	---help---
 	  Say yes to build a 64-bit kernel - formerly known as x86_64
 	  Say no to build a 32-bit kernel - formerly known as i386
@@ -28,6 +28,8 @@ config X86_64
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
+	select NEED_DMA_MAP_STATE
+	select SWIOTLB
 	select X86_DEV_DMA_OPS
 	select ARCH_HAS_SYSCALL_WRAPPER
 
@@ -52,6 +54,7 @@ config X86
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
+	select ARCH_HAS_FILTER_PGPROT
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
@@ -59,6 +62,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
+	select ARCH_HAS_UACCESS_MCSAFE		if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
@@ -133,11 +137,10 @@ config X86
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_EBPF_JIT			if X86_64
+	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EXIT_THREAD
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
@@ -183,6 +186,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select IRQ_FORCED_THREADING
+	select NEED_SG_DMA_LENGTH
 	select PCI_LOCKLESS_CONFIG
 	select PERF_EVENTS
 	select RTC_LIB
@@ -235,13 +239,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX
 config SBUS
 	bool
 
-config NEED_DMA_MAP_STATE
-	def_bool y
-	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
-
-config NEED_SG_DMA_LENGTH
-	def_bool y
-
 config GENERIC_ISA_DMA
 	def_bool y
 	depends on ISA_DMA_API
@@ -273,6 +270,9 @@ config ARCH_HAS_CPU_RELAX
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
+config ARCH_HAS_FILTER_PGPROT
+	def_bool y
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
@@ -871,6 +871,7 @@ config DMI
 
 config GART_IOMMU
 	bool "Old AMD GART IOMMU support"
+	select IOMMU_HELPER
 	select SWIOTLB
 	depends on X86_64 && PCI && AMD_NB
 	---help---
@@ -892,6 +893,7 @@ config GART_IOMMU
 
 config CALGARY_IOMMU
 	bool "IBM Calgary IOMMU support"
+	select IOMMU_HELPER
 	select SWIOTLB
 	depends on X86_64 && PCI
 	---help---
@@ -919,20 +921,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
 	  If unsure, say Y.
 
-# need this always selected by IOMMU for the VIA workaround
-config SWIOTLB
-	def_bool y if X86_64
-	---help---
-	  Support for software bounce buffers used on x86-64 systems
-	  which don't have a hardware IOMMU. Using this PCI devices
-	  which can only access 32-bits of memory can be used on systems
-	  with more than 3 GB of memory.
-	  If unsure, say Y.
-
-config IOMMU_HELPER
-	def_bool y
-	depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
-
 config MAXSMP
 	bool "Enable Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL
@@ -1454,6 +1442,7 @@ config HIGHMEM
 config X86_PAE
 	bool "PAE (Physical Address Extension) Support"
 	depends on X86_32 && !HIGHMEM4G
+	select PHYS_ADDR_T_64BIT
 	select SWIOTLB
 	---help---
 	  PAE is required for NX support, and furthermore enables
@@ -1481,14 +1470,6 @@ config X86_5LEVEL
 
 	  Say N if unsure.
 
-config ARCH_PHYS_ADDR_T_64BIT
-	def_bool y
-	depends on X86_64 || X86_PAE
-
-config ARCH_DMA_ADDR_T_64BIT
-	def_bool y
-	depends on X86_64 || HIGHMEM64G
-
 config X86_DIRECT_GBPAGES
 	def_bool y
 	depends on X86_64 && !DEBUG_PAGEALLOC
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 60135cbd905c..f0a6ea22429d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -94,7 +94,7 @@ ifeq ($(CONFIG_X86_32),y)
 else
         BITS := 64
         UTS_MACHINE := x86_64
-        CHECKFLAGS += -D__x86_64__ -m64
+        CHECKFLAGS += -D__x86_64__
 
         biarch := -m64
         KBUILD_AFLAGS += -m64
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 0cb325734cfb..af6cda0b7900 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "misc.h"
 
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
 
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 47d3efff6805..a8a8642d2b0b 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -109,23 +109,34 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
 }
 
 static efi_status_t
-__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
+__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
 {
 	struct pci_setup_rom *rom = NULL;
 	efi_status_t status;
 	unsigned long size;
-	uint64_t attributes;
+	uint64_t attributes, romsize;
+	void *romimage;
 
-	status = efi_early->call(pci->attributes, pci,
-				 EfiPciIoAttributeOperationGet, 0, 0,
-				 &attributes);
+	status = efi_call_proto(efi_pci_io_protocol, attributes, pci,
+				EfiPciIoAttributeOperationGet, 0, 0,
+				&attributes);
 	if (status != EFI_SUCCESS)
 		return status;
 
-	if (!pci->romimage || !pci->romsize)
+	/*
+	 * Some firmware images contain EFI function pointers at the place where the
+	 * romimage and romsize fields are supposed to be. Typically the EFI
+	 * code is mapped at high addresses, translating to an unrealistically
+	 * large romsize. The UEFI spec limits the size of option ROMs to 16
+	 * MiB so we reject any ROMs over 16 MiB in size to catch this.
+	 */
+	romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol,
+							 romimage, pci);
+	romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci);
+	if (!romimage || !romsize || romsize > SZ_16M)
 		return EFI_INVALID_PARAMETER;
 
-	size = pci->romsize + sizeof(*rom);
+	size = romsize + sizeof(*rom);
 
 	status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
 	if (status != EFI_SUCCESS) {
@@ -141,29 +152,32 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
 	rom->pcilen = pci->romsize;
 	*__rom = rom;
 
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_VENDOR_ID, 1, &(rom->vendor));
+	status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+				EfiPciIoWidthUint16, PCI_VENDOR_ID, 1,
+				&rom->vendor);
 
 	if (status != EFI_SUCCESS) {
 		efi_printk(sys_table, "Failed to read rom->vendor\n");
 		goto free_struct;
 	}
 
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_DEVICE_ID, 1, &(rom->devid));
+	status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+				EfiPciIoWidthUint16, PCI_DEVICE_ID, 1,
+				&rom->devid);
 
 	if (status != EFI_SUCCESS) {
 		efi_printk(sys_table, "Failed to read rom->devid\n");
 		goto free_struct;
 	}
 
-	status = efi_early->call(pci->get_location, pci, &(rom->segment),
-				 &(rom->bus), &(rom->device), &(rom->function));
+	status = efi_call_proto(efi_pci_io_protocol, get_location, pci,
+				&rom->segment, &rom->bus, &rom->device,
+				&rom->function);
 
 	if (status != EFI_SUCCESS)
 		goto free_struct;
 
-	memcpy(rom->romdata, pci->romimage, pci->romsize);
+	memcpy(rom->romdata, romimage, romsize);
 	return status;
 
 free_struct:
@@ -175,7 +189,7 @@ static void
 setup_efi_pci32(struct boot_params *params, void **pci_handle,
 		unsigned long size)
 {
-	efi_pci_io_protocol_32 *pci = NULL;
+	efi_pci_io_protocol_t *pci = NULL;
 	efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
 	u32 *handles = (u32 *)(unsigned long)pci_handle;
 	efi_status_t status;
@@ -202,7 +216,7 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
 		if (!pci)
 			continue;
 
-		status = __setup_efi_pci32(pci, &rom);
+		status = __setup_efi_pci(pci, &rom);
 		if (status != EFI_SUCCESS)
 			continue;
 
@@ -216,73 +230,11 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
 	}
 }
 
-static efi_status_t
-__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
-{
-	struct pci_setup_rom *rom;
-	efi_status_t status;
-	unsigned long size;
-	uint64_t attributes;
-
-	status = efi_early->call(pci->attributes, pci,
-				 EfiPciIoAttributeOperationGet, 0,
-				 &attributes);
-	if (status != EFI_SUCCESS)
-		return status;
-
-	if (!pci->romimage || !pci->romsize)
-		return EFI_INVALID_PARAMETER;
-
-	size = pci->romsize + sizeof(*rom);
-
-	status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to alloc mem for rom\n");
-		return status;
-	}
-
-	rom->data.type = SETUP_PCI;
-	rom->data.len = size - sizeof(struct setup_data);
-	rom->data.next = 0;
-	rom->pcilen = pci->romsize;
-	*__rom = rom;
-
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_VENDOR_ID, 1, &(rom->vendor));
-
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to read rom->vendor\n");
-		goto free_struct;
-	}
-
-	status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
-				 PCI_DEVICE_ID, 1, &(rom->devid));
-
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to read rom->devid\n");
-		goto free_struct;
-	}
-
-	status = efi_early->call(pci->get_location, pci, &(rom->segment),
-				 &(rom->bus), &(rom->device), &(rom->function));
-
-	if (status != EFI_SUCCESS)
-		goto free_struct;
-
-	memcpy(rom->romdata, pci->romimage, pci->romsize);
-	return status;
-
-free_struct:
-	efi_call_early(free_pool, rom);
-	return status;
-
-}
-
 static void
 setup_efi_pci64(struct boot_params *params, void **pci_handle,
 		unsigned long size)
 {
-	efi_pci_io_protocol_64 *pci = NULL;
+	efi_pci_io_protocol_t *pci = NULL;
 	efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
 	u64 *handles = (u64 *)(unsigned long)pci_handle;
 	efi_status_t status;
@@ -309,7 +261,7 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle,
 		if (!pci)
 			continue;
 
-		status = __setup_efi_pci64(pci, &rom);
+		status = __setup_efi_pci(pci, &rom);
 		if (status != EFI_SUCCESS)
 			continue;
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fca012baba19..64037895b085 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -306,6 +306,25 @@ ENTRY(startup_64)
 	leaq	boot_stack_end(%rbx), %rsp
 
 	/*
+	 * paging_prepare() and cleanup_trampoline() below can have GOT
+	 * references. Adjust the table with address we are running at.
+	 *
+	 * Zero RAX for adjust_got: the GOT was not adjusted before;
+	 * there's no adjustment to undo.
+	 */
+	xorq	%rax, %rax
+
+	/*
+	 * Calculate the address the binary is loaded at and use it as
+	 * a GOT adjustment.
+	 */
+	call	1f
+1:	popq	%rdi
+	subq	$1b, %rdi
+
+	call	adjust_got
+
+	/*
 	 * At this point we are in long mode with 4-level paging enabled,
 	 * but we might want to enable 5-level paging or vice versa.
 	 *
@@ -346,6 +365,7 @@ ENTRY(startup_64)
 	 * this function call.
 	 */
 	pushq	%rsi
+	movq	%rsi, %rdi		/* real mode address */
 	call	paging_prepare
 	popq	%rsi
 
@@ -370,10 +390,14 @@ trampoline_return:
 	/*
 	 * cleanup_trampoline() would restore trampoline memory.
 	 *
+	 * RDI is address of the page table to use instead of page table
+	 * in trampoline memory (if required).
+	 *
 	 * RSI holds real mode data and needs to be preserved across
 	 * this function call.
 	 */
 	pushq	%rsi
+	leaq	top_pgtable(%rbx), %rdi
 	call	cleanup_trampoline
 	popq	%rsi
 
@@ -381,6 +405,21 @@ trampoline_return:
 	pushq	$0
 	popfq
 
+	/*
+	 * Previously we've adjusted the GOT with address the binary was
+	 * loaded at. Now we need to re-adjust for relocation address.
+	 *
+	 * Calculate the address the binary is loaded at, so that we can
+	 * undo the previous GOT adjustment.
+	 */
+	call	1f
+1:	popq	%rax
+	subq	$1b, %rax
+
+	/* The new adjustment is the relocation address */
+	movq	%rbx, %rdi
+	call	adjust_got
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -482,19 +521,6 @@ relocated:
 	rep	stosq
 
 /*
- * Adjust our own GOT
- */
-	leaq	_got(%rip), %rdx
-	leaq	_egot(%rip), %rcx
-1:
-	cmpq	%rcx, %rdx
-	jae	2f
-	addq	%rbx, (%rdx)
-	addq	$8, %rdx
-	jmp	1b
-2:
-	
-/*
  * Do the extraction, and jump to the new kernel..
  */
 	pushq	%rsi			/* Save the real mode argument */
@@ -512,6 +538,27 @@ relocated:
  */
 	jmp	*%rax
 
+/*
+ * Adjust the global offset table
+ *
+ * RAX is the previous adjustment of the table to undo (use 0 if it's the
+ * first time we touch GOT).
+ * RDI is the new adjustment to apply.
+ */
+adjust_got:
+	/* Walk through the GOT adding the address to the entries */
+	leaq	_got(%rip), %rdx
+	leaq	_egot(%rip), %rcx
+1:
+	cmpq	%rcx, %rdx
+	jae	2f
+	subq	%rax, (%rdx)	/* Undo previous adjustment */
+	addq	%rdi, (%rdx)	/* Apply the new adjustment */
+	addq	$8, %rdx
+	jmp	1b
+2:
+	ret
+
 	.code32
 /*
  * This is the 32-bit trampoline that will be copied over to low memory.
@@ -649,3 +696,10 @@ boot_stack_end:
 	.balign 4096
 pgtable:
 	.fill BOOT_PGT_SIZE, 1, 0
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ */
+top_pgtable:
+	.fill PAGE_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index a0a50b91ecef..b87a7582853d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -47,7 +47,7 @@
 #include <linux/decompress/mm.h>
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int __pgtable_l5_enabled;
 unsigned int pgdir_shift __ro_after_init = 39;
 unsigned int ptrs_per_p4d __ro_after_init = 1;
 #endif
@@ -734,7 +734,7 @@ void choose_random_location(unsigned long input,
 
 #ifdef CONFIG_X86_5LEVEL
 	if (__read_cr4() & X86_CR4_LA57) {
-		pgtable_l5_enabled = 1;
+		__pgtable_l5_enabled = 1;
 		pgdir_shift = 48;
 		ptrs_per_p4d = 512;
 	}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9e11be4cae19..a423bdb42686 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,10 +12,8 @@
 #undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
 
-#ifdef CONFIG_X86_5LEVEL
-/* cpu_feature_enabled() cannot be used that early */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
 
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 32af1cbcd903..8c5107545251 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -23,14 +23,6 @@ struct paging_config {
 static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
 
 /*
- * The page table is going to be used instead of page table in the trampoline
- * memory.
- *
- * It must not be in BSS as BSS is cleared after cleanup_trampoline().
- */
-static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
-
-/*
  * Trampoline address will be printed by extract_kernel() for debugging
  * purposes.
  *
@@ -39,16 +31,23 @@ static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
  */
 unsigned long *trampoline_32bit __section(.data);
 
-struct paging_config paging_prepare(void)
+extern struct boot_params *boot_params;
+int cmdline_find_option_bool(const char *option);
+
+struct paging_config paging_prepare(void *rmode)
 {
 	struct paging_config paging_config = {};
 	unsigned long bios_start, ebda_start;
 
+	/* Initialize boot_params. Required for cmdline_find_option_bool(). */
+	boot_params = rmode;
+
 	/*
 	 * Check if LA57 is desired and supported.
 	 *
-	 * There are two parts to the check:
+	 * There are several parts to the check:
 	 *   - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
+	 *   - if user asked to disable 5-level paging: no5lvl in cmdline
 	 *   - if the machine supports 5-level paging:
 	 *     + CPUID leaf 7 is supported
 	 *     + the leaf has the feature bit set
@@ -56,6 +55,7 @@ struct paging_config paging_prepare(void)
 	 * That's substitute for boot_cpu_has() in early boot code.
 	 */
 	if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
+			!cmdline_find_option_bool("no5lvl") &&
 			native_cpuid_eax(0) >= 7 &&
 			(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
 		paging_config.l5_required = 1;
@@ -134,19 +134,19 @@ out:
 	return paging_config;
 }
 
-void cleanup_trampoline(void)
+void cleanup_trampoline(void *pgtable)
 {
 	void *trampoline_pgtable;
 
-	trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
+	trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
 
 	/*
 	 * Move the top level page table out of trampoline memory,
 	 * if it's there.
 	 */
 	if ((void *)__native_read_cr3() == trampoline_pgtable) {
-		memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
-		native_write_cr3((unsigned long)top_pgtable);
+		memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
+		native_write_cr3((unsigned long)pgtable);
 	}
 
 	/* Restore trampoline memory */
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5f07333bb224..a450ad573dcb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -38,6 +36,16 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o
+obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
+obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -55,11 +63,12 @@ ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
 	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
 	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
+
+	obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -68,10 +77,16 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
+aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
+
+morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
+morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+
 ifeq ($(avx_supported),yes)
 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 					camellia_aesni_avx_glue.o
@@ -87,6 +102,8 @@ ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	chacha20-x86_64-y += chacha20-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+
+	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 000000000000..9254e0b6cc06
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,749 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0	%xmm0
+#define STATE1	%xmm1
+#define STATE2	%xmm2
+#define STATE3	%xmm3
+#define STATE4	%xmm4
+#define KEY	%xmm5
+#define MSG	%xmm5
+#define T0	%xmm6
+#define T1	%xmm7
+
+#define STATEP	%rdi
+#define LEN	%rsi
+#define SRC	%rdx
+#define DST	%rcx
+
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
+.align 16
+.Laegis128_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * aegis128_update
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state (shifted positions)
+ * changed:
+ *   T0
+ */
+.macro aegis128_update
+	movdqa STATE4, T0
+	aesenc STATE0, STATE4
+	aesenc STATE1, STATE0
+	aesenc STATE2, STATE1
+	aesenc STATE3, STATE2
+	aesenc T0,     STATE3
+.endm
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG, MSG
+
+	mov LEN, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov LEN, %r8
+	and $0x1E, %r8
+	add SRC, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov LEN, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov LEN, %r8
+	and $0x1C, %r8
+	add SRC, %r8
+	shl $0x10, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov LEN, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov LEN, %r8
+	and $0x18, %r8
+	add SRC, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG
+
+	mov LEN, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov LEN, %r8
+	and $0x10, %r8
+	add SRC, %r8
+	pslldq $8, MSG
+	movq (%r8), T0
+	pxor T0, MSG
+
+.Lld_partial_8:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov LEN, %r8
+	mov DST, %r9
+
+	movq T0, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0
+	movq T0, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $0x10, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis128_aesni_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	movdqu (%rdx), T1
+
+	/* load key: */
+	movdqa (%rsi), KEY
+	pxor KEY, T1
+	movdqa T1, STATE0
+	movdqa KEY, STATE3
+	movdqa KEY, STATE4
+
+	/* load the constants: */
+	movdqa .Laegis128_const_0, STATE2
+	movdqa .Laegis128_const_1, STATE1
+	pxor STATE2, STATE3
+	pxor STATE1, STATE4
+
+	/* update 10 times with KEY / KEY xor IV: */
+	aegis128_update; pxor KEY, STATE4
+	aegis128_update; pxor T1,  STATE3
+	aegis128_update; pxor KEY, STATE2
+	aegis128_update; pxor T1,  STATE1
+	aegis128_update; pxor KEY, STATE0
+	aegis128_update; pxor T1,  STATE4
+	aegis128_update; pxor KEY, STATE3
+	aegis128_update; pxor T1,  STATE2
+	aegis128_update; pxor KEY, STATE1
+	aegis128_update; pxor T1,  STATE0
+
+	/* store the state: */
+	movdqu STATE0, 0x00(STATEP)
+	movdqu STATE1, 0x10(STATEP)
+	movdqu STATE2, 0x20(STATEP)
+	movdqu STATE3, 0x30(STATEP)
+	movdqu STATE4, 0x40(STATEP)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_init)
+
+/*
+ * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
+ *                               const void *data);
+ */
+ENTRY(crypto_aegis128_aesni_ad)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Lad_out
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	mov SRC, %r8
+	and $0xF, %r8
+	jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+	movdqa 0x00(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE4
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_1
+
+	movdqa 0x10(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE3
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_2
+
+	movdqa 0x20(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE2
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_3
+
+	movdqa 0x30(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE1
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_4
+
+	movdqa 0x40(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE0
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_0
+
+	add $0x50, SRC
+	jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+	movdqu 0x00(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE4
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_1
+
+	movdqu 0x10(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE3
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_2
+
+	movdqu 0x20(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE2
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_3
+
+	movdqu 0x30(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE1
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_4
+
+	movdqu 0x40(SRC), MSG
+	aegis128_update
+	pxor MSG, STATE0
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_0
+
+	add $0x50, SRC
+	jmp .Lad_u_loop
+
+	/* store the state: */
+.Lad_out_0:
+	movdqu STATE0, 0x00(STATEP)
+	movdqu STATE1, 0x10(STATEP)
+	movdqu STATE2, 0x20(STATEP)
+	movdqu STATE3, 0x30(STATEP)
+	movdqu STATE4, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lad_out_1:
+	movdqu STATE4, 0x00(STATEP)
+	movdqu STATE0, 0x10(STATEP)
+	movdqu STATE1, 0x20(STATEP)
+	movdqu STATE2, 0x30(STATEP)
+	movdqu STATE3, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lad_out_2:
+	movdqu STATE3, 0x00(STATEP)
+	movdqu STATE4, 0x10(STATEP)
+	movdqu STATE0, 0x20(STATEP)
+	movdqu STATE1, 0x30(STATEP)
+	movdqu STATE2, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lad_out_3:
+	movdqu STATE2, 0x00(STATEP)
+	movdqu STATE3, 0x10(STATEP)
+	movdqu STATE4, 0x20(STATEP)
+	movdqu STATE0, 0x30(STATEP)
+	movdqu STATE1, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lad_out_4:
+	movdqu STATE1, 0x00(STATEP)
+	movdqu STATE2, 0x10(STATEP)
+	movdqu STATE3, 0x20(STATEP)
+	movdqu STATE4, 0x30(STATEP)
+	movdqu STATE0, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_ad)
+
+.macro encrypt_block a s0 s1 s2 s3 s4 i
+	movdq\a (\i * 0x10)(SRC), MSG
+	movdqa MSG, T0
+	pxor \s1, T0
+	pxor \s4, T0
+	movdqa \s2, T1
+	pand \s3, T1
+	pxor T1, T0
+	movdq\a T0, (\i * 0x10)(DST)
+
+	aegis128_update
+	pxor MSG, \s4
+
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lenc_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Lenc_out
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	mov  SRC,  %r8
+	or   DST,  %r8
+	and $0xF, %r8
+	jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+	add $0x50, SRC
+	add $0x50, DST
+	jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+	add $0x50, SRC
+	add $0x50, DST
+	jmp .Lenc_u_loop
+
+	/* store the state: */
+.Lenc_out_0:
+	movdqu STATE4, 0x00(STATEP)
+	movdqu STATE0, 0x10(STATEP)
+	movdqu STATE1, 0x20(STATEP)
+	movdqu STATE2, 0x30(STATEP)
+	movdqu STATE3, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lenc_out_1:
+	movdqu STATE3, 0x00(STATEP)
+	movdqu STATE4, 0x10(STATEP)
+	movdqu STATE0, 0x20(STATEP)
+	movdqu STATE1, 0x30(STATEP)
+	movdqu STATE2, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lenc_out_2:
+	movdqu STATE2, 0x00(STATEP)
+	movdqu STATE3, 0x10(STATEP)
+	movdqu STATE4, 0x20(STATEP)
+	movdqu STATE0, 0x30(STATEP)
+	movdqu STATE1, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lenc_out_3:
+	movdqu STATE1, 0x00(STATEP)
+	movdqu STATE2, 0x10(STATEP)
+	movdqu STATE3, 0x20(STATEP)
+	movdqu STATE4, 0x30(STATEP)
+	movdqu STATE0, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lenc_out_4:
+	movdqu STATE0, 0x00(STATEP)
+	movdqu STATE1, 0x10(STATEP)
+	movdqu STATE2, 0x20(STATEP)
+	movdqu STATE3, 0x30(STATEP)
+	movdqu STATE4, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_enc)
+
+/*
+ * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG, T0
+	pxor STATE1, T0
+	pxor STATE4, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+
+	call __store_partial
+
+	aegis128_update
+	pxor MSG, STATE4
+
+	/* store the state: */
+	movdqu STATE4, 0x00(STATEP)
+	movdqu STATE0, 0x10(STATEP)
+	movdqu STATE1, 0x20(STATEP)
+	movdqu STATE2, 0x30(STATEP)
+	movdqu STATE3, 0x40(STATEP)
+
+	FRAME_END
+ENDPROC(crypto_aegis128_aesni_enc_tail)
+
+.macro decrypt_block a s0 s1 s2 s3 s4 i
+	movdq\a (\i * 0x10)(SRC), MSG
+	pxor \s1, MSG
+	pxor \s4, MSG
+	movdqa \s2, T1
+	pand \s3, T1
+	pxor T1, MSG
+	movdq\a MSG, (\i * 0x10)(DST)
+
+	aegis128_update
+	pxor MSG, \s4
+
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Ldec_out
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	mov  SRC, %r8
+	or   DST, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+	add $0x50, SRC
+	add $0x50, DST
+	jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+	add $0x50, SRC
+	add $0x50, DST
+	jmp .Ldec_u_loop
+
+	/* store the state: */
+.Ldec_out_0:
+	movdqu STATE4, 0x00(STATEP)
+	movdqu STATE0, 0x10(STATEP)
+	movdqu STATE1, 0x20(STATEP)
+	movdqu STATE2, 0x30(STATEP)
+	movdqu STATE3, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Ldec_out_1:
+	movdqu STATE3, 0x00(STATEP)
+	movdqu STATE4, 0x10(STATEP)
+	movdqu STATE0, 0x20(STATEP)
+	movdqu STATE1, 0x30(STATEP)
+	movdqu STATE2, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Ldec_out_2:
+	movdqu STATE2, 0x00(STATEP)
+	movdqu STATE3, 0x10(STATEP)
+	movdqu STATE4, 0x20(STATEP)
+	movdqu STATE0, 0x30(STATEP)
+	movdqu STATE1, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Ldec_out_3:
+	movdqu STATE1, 0x00(STATEP)
+	movdqu STATE2, 0x10(STATEP)
+	movdqu STATE3, 0x20(STATEP)
+	movdqu STATE4, 0x30(STATEP)
+	movdqu STATE0, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Ldec_out_4:
+	movdqu STATE0, 0x00(STATEP)
+	movdqu STATE1, 0x10(STATEP)
+	movdqu STATE2, 0x20(STATEP)
+	movdqu STATE3, 0x30(STATEP)
+	movdqu STATE4, 0x40(STATEP)
+	FRAME_END
+	ret
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_dec)
+
+/*
+ * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	/* decrypt message: */
+	call __load_partial
+
+	pxor STATE1, MSG
+	pxor STATE4, MSG
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, MSG
+
+	movdqa MSG, T0
+	call __store_partial
+
+	/* mask with byte count: */
+	movq LEN, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	movdqa .Laegis128_counter, T1
+	pcmpgtb T1, T0
+	pand T0, MSG
+
+	aegis128_update
+	pxor MSG, STATE4
+
+	/* store the state: */
+	movdqu STATE4, 0x00(STATEP)
+	movdqu STATE0, 0x10(STATEP)
+	movdqu STATE1, 0x20(STATEP)
+	movdqu STATE2, 0x30(STATEP)
+	movdqu STATE3, 0x40(STATEP)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis128_aesni_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+
+	/* prepare length block: */
+	movq %rdx, MSG
+	movq %rcx, T0
+	pslldq $8, T0
+	pxor T0, MSG
+	psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+	pxor STATE3, MSG
+
+	/* update state: */
+	aegis128_update; pxor MSG, STATE4
+	aegis128_update; pxor MSG, STATE3
+	aegis128_update; pxor MSG, STATE2
+	aegis128_update; pxor MSG, STATE1
+	aegis128_update; pxor MSG, STATE0
+	aegis128_update; pxor MSG, STATE4
+	aegis128_update; pxor MSG, STATE3
+
+	/* xor tag: */
+	movdqu (%rsi), MSG
+
+	pxor STATE0, MSG
+	pxor STATE1, MSG
+	pxor STATE2, MSG
+	pxor STATE3, MSG
+	pxor STATE4, MSG
+
+	movdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 000000000000..5de7c0d46edf
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128_aesni_ad(
+		void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128_aesni_enc(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_enc_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_final(
+		void *state, void *tag_xor, unsigned int cryptlen,
+		unsigned int assoclen);
+
+struct aegis_block {
+	u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+	struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+	struct aegis_block key;
+};
+
+struct aegis_crypt_ops {
+	int (*skcipher_walk_init)(struct skcipher_walk *walk,
+				  struct aead_request *req, bool atomic);
+
+	void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+			     void *dst);
+	void (*crypt_tail)(void *state, unsigned int length, const void *src,
+			   void *dst);
+};
+
+static void crypto_aegis128_aesni_process_ad(
+		struct aegis_state *state, struct scatterlist *sg_src,
+		unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct aegis_block buf;
+	unsigned int pos = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int left = size;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *src = (const u8 *)mapped;
+
+		if (pos + size >= AEGIS128_BLOCK_SIZE) {
+			if (pos > 0) {
+				unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
+				memcpy(buf.bytes + pos, src, fill);
+				crypto_aegis128_aesni_ad(state,
+							 AEGIS128_BLOCK_SIZE,
+							 buf.bytes);
+				pos = 0;
+				left -= fill;
+				src += fill;
+			}
+
+			crypto_aegis128_aesni_ad(state, left, src);
+
+			src += left & ~(AEGIS128_BLOCK_SIZE - 1);
+			left &= AEGIS128_BLOCK_SIZE - 1;
+		}
+
+		memcpy(buf.bytes + pos, src, left);
+		pos += left;
+		assoclen -= size;
+
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, size);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (pos > 0) {
+		memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
+		crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+	}
+}
+
+static void crypto_aegis128_aesni_process_crypt(
+		struct aegis_state *state, struct aead_request *req,
+		const struct aegis_crypt_ops *ops)
+{
+	struct skcipher_walk walk;
+	u8 *src, *dst;
+	unsigned int chunksize, base;
+
+	ops->skcipher_walk_init(&walk, req, false);
+
+	while (walk.nbytes) {
+		src = walk.src.virt.addr;
+		dst = walk.dst.virt.addr;
+		chunksize = walk.nbytes;
+
+		ops->crypt_blocks(state, chunksize, src, dst);
+
+		base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1);
+		src += base;
+		dst += base;
+		chunksize &= AEGIS128_BLOCK_SIZE - 1;
+
+		if (chunksize > 0)
+			ops->crypt_tail(state, chunksize, src, dst);
+
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *ctx = crypto_aead_ctx(aead);
+	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+	return (void *)ctx;
+}
+
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+					unsigned int keylen)
+{
+	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
+
+	if (keylen != AEGIS128_KEY_SIZE) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+
+	return 0;
+}
+
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+						unsigned int authsize)
+{
+	if (authsize > AEGIS128_MAX_AUTH_SIZE)
+		return -EINVAL;
+	if (authsize < AEGIS128_MIN_AUTH_SIZE)
+		return -EINVAL;
+	return 0;
+}
+
+static void crypto_aegis128_aesni_crypt(struct aead_request *req,
+					struct aegis_block *tag_xor,
+					unsigned int cryptlen,
+					const struct aegis_crypt_ops *ops)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+	struct aegis_state state;
+
+	kernel_fpu_begin();
+
+	crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
+	crypto_aegis128_aesni_process_crypt(&state, req, ops);
+	crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = crypto_aegis128_aesni_enc,
+		.crypt_tail = crypto_aegis128_aesni_enc_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag = {};
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, authsize, 1);
+	return 0;
+}
+
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+	static const struct aegis_block zeros = {};
+
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = crypto_aegis128_aesni_dec,
+		.crypt_tail = crypto_aegis128_aesni_dec_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag;
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen - authsize;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, authsize, 0);
+
+	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+	return 0;
+}
+
+static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead,
+					const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead,
+					     unsigned int authsize)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis128_aesni_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis128_aesni_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+
+static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis128_aesni_alg[] = {
+	{
+		.setkey = crypto_aegis128_aesni_setkey,
+		.setauthsize = crypto_aegis128_aesni_setauthsize,
+		.encrypt = crypto_aegis128_aesni_encrypt,
+		.decrypt = crypto_aegis128_aesni_decrypt,
+		.init = crypto_aegis128_aesni_init_tfm,
+		.exit = crypto_aegis128_aesni_exit_tfm,
+
+		.ivsize = AEGIS128_NONCE_SIZE,
+		.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+		.chunksize = AEGIS128_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_INTERNAL,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct aegis_ctx) +
+				__alignof__(struct aegis_ctx),
+			.cra_alignmask = 0,
+
+			.cra_name = "__aegis128",
+			.cra_driver_name = "__aegis128-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}, {
+		.setkey = cryptd_aegis128_aesni_setkey,
+		.setauthsize = cryptd_aegis128_aesni_setauthsize,
+		.encrypt = cryptd_aegis128_aesni_encrypt,
+		.decrypt = cryptd_aegis128_aesni_decrypt,
+		.init = cryptd_aegis128_aesni_init_tfm,
+		.exit = cryptd_aegis128_aesni_exit_tfm,
+
+		.ivsize = AEGIS128_NONCE_SIZE,
+		.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+		.chunksize = AEGIS128_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_ASYNC,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct cryptd_aead *),
+			.cra_alignmask = 0,
+
+			.cra_priority = 400,
+
+			.cra_name = "aegis128",
+			.cra_driver_name = "aegis128-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}
+};
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_AES),
+	X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+	if (!x86_match_cpu(aesni_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis128_aesni_alg,
+				     ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_aegis128_aesni_alg,
+				ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
new file mode 100644
index 000000000000..9263c344f2c7
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -0,0 +1,825 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0	%xmm0
+#define STATE1	%xmm1
+#define STATE2	%xmm2
+#define STATE3	%xmm3
+#define STATE4	%xmm4
+#define STATE5	%xmm5
+#define STATE6	%xmm6
+#define STATE7	%xmm7
+#define MSG0	%xmm8
+#define MSG1	%xmm9
+#define T0	%xmm10
+#define T1	%xmm11
+#define T2	%xmm12
+#define T3	%xmm13
+
+#define STATEP	%rdi
+#define LEN	%rsi
+#define SRC	%rdx
+#define DST	%rcx
+
+.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
+.align 16
+.Laegis128l_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128l_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
+.align 16
+.Laegis128l_counter0:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Laegis128l_counter1:
+	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG0 - first message block
+ *   MSG1 - second message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG0, MSG0
+	pxor MSG1, MSG1
+
+	mov LEN, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov LEN, %r8
+	and $0x1E, %r8
+	add SRC, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov LEN, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov LEN, %r8
+	and $0x1C, %r8
+	add SRC, %r8
+	shl $0x10, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov LEN, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov LEN, %r8
+	and $0x18, %r8
+	add SRC, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG0
+
+	mov LEN, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov LEN, %r8
+	and $0x10, %r8
+	add SRC, %r8
+	pslldq $8, MSG0
+	movq (%r8), T0
+	pxor T0, MSG0
+
+.Lld_partial_8:
+	mov LEN, %r8
+	and $0x10, %r8
+	jz .Lld_partial_16
+
+	movdqa MSG0, MSG1
+	movdqu (SRC), MSG0
+
+.Lld_partial_16:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ * output:
+ *   T0   - first message block
+ *   T1   - second message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov LEN, %r8
+	mov DST, %r9
+
+	cmp $16, %r8
+	jl .Lst_partial_16
+
+	movdqu T0, (%r9)
+	movdqa T1, T0
+
+	sub $16, %r8
+	add $16, %r9
+
+.Lst_partial_16:
+	movq T0, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0
+	movq T0, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $0x10, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+.macro update
+	movdqa STATE7, T0
+	aesenc STATE0, STATE7
+	aesenc STATE1, STATE0
+	aesenc STATE2, STATE1
+	aesenc STATE3, STATE2
+	aesenc STATE4, STATE3
+	aesenc STATE5, STATE4
+	aesenc STATE6, STATE5
+	aesenc T0,     STATE6
+.endm
+
+.macro update0
+	update
+	pxor MSG0, STATE7
+	pxor MSG1, STATE3
+.endm
+
+.macro update1
+	update
+	pxor MSG0, STATE6
+	pxor MSG1, STATE2
+.endm
+
+.macro update2
+	update
+	pxor MSG0, STATE5
+	pxor MSG1, STATE1
+.endm
+
+.macro update3
+	update
+	pxor MSG0, STATE4
+	pxor MSG1, STATE0
+.endm
+
+.macro update4
+	update
+	pxor MSG0, STATE3
+	pxor MSG1, STATE7
+.endm
+
+.macro update5
+	update
+	pxor MSG0, STATE2
+	pxor MSG1, STATE6
+.endm
+
+.macro update6
+	update
+	pxor MSG0, STATE1
+	pxor MSG1, STATE5
+.endm
+
+.macro update7
+	update
+	pxor MSG0, STATE0
+	pxor MSG1, STATE4
+.endm
+
+.macro state_load
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+	movdqu 0x50(STATEP), STATE5
+	movdqu 0x60(STATEP), STATE6
+	movdqu 0x70(STATEP), STATE7
+.endm
+
+.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
+	movdqu \s7, 0x00(STATEP)
+	movdqu \s0, 0x10(STATEP)
+	movdqu \s1, 0x20(STATEP)
+	movdqu \s2, 0x30(STATEP)
+	movdqu \s3, 0x40(STATEP)
+	movdqu \s4, 0x50(STATEP)
+	movdqu \s5, 0x60(STATEP)
+	movdqu \s6, 0x70(STATEP)
+.endm
+
+.macro state_store0
+	state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro state_store1
+	state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro state_store2
+	state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store3
+	state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store4
+	state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store5
+	state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store6
+	state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro state_store7
+	state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis128l_aesni_init)
+	FRAME_BEGIN
+
+	/* load key: */
+	movdqa (%rsi), MSG1
+	movdqa MSG1, STATE0
+	movdqa MSG1, STATE4
+	movdqa MSG1, STATE5
+	movdqa MSG1, STATE6
+	movdqa MSG1, STATE7
+
+	/* load IV: */
+	movdqu (%rdx), MSG0
+	pxor MSG0, STATE0
+	pxor MSG0, STATE4
+
+	/* load the constants: */
+	movdqa .Laegis128l_const_0, STATE2
+	movdqa .Laegis128l_const_1, STATE1
+	movdqa STATE1, STATE3
+	pxor STATE2, STATE5
+	pxor STATE1, STATE6
+	pxor STATE2, STATE7
+
+	/* update 10 times with IV and KEY: */
+	update0
+	update1
+	update2
+	update3
+	update4
+	update5
+	update6
+	update7
+	update0
+	update1
+
+	state_store1
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_init)
+
+.macro ad_block a i
+	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+	update\i
+	sub $0x20, LEN
+	cmp $0x20, LEN
+	jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
+ *                                const void *data);
+ */
+ENTRY(crypto_aegis128l_aesni_ad)
+	FRAME_BEGIN
+
+	cmp $0x20, LEN
+	jb .Lad_out
+
+	state_load
+
+	mov  SRC, %r8
+	and $0xf, %r8
+	jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+	ad_block a 0
+	ad_block a 1
+	ad_block a 2
+	ad_block a 3
+	ad_block a 4
+	ad_block a 5
+	ad_block a 6
+	ad_block a 7
+
+	add $0x100, SRC
+	jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+	ad_block u 0
+	ad_block u 1
+	ad_block u 2
+	ad_block u 3
+	ad_block u 4
+	ad_block u 5
+	ad_block u 6
+	ad_block u 7
+
+	add $0x100, SRC
+	jmp .Lad_u_loop
+
+.Lad_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Lad_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Lad_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Lad_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Lad_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Lad_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Lad_out_6:
+	state_store6
+	FRAME_END
+	ret
+
+.Lad_out_7:
+	state_store7
+	FRAME_END
+	ret
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_ad)
+
+.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
+	pxor \s1, \m0
+	pxor \s6, \m0
+	movdqa \s2, T3
+	pand \s3, T3
+	pxor T3, \m0
+
+	pxor \s2, \m1
+	pxor \s5, \m1
+	movdqa \s6, T3
+	pand \s7, T3
+	pxor T3, \m1
+.endm
+
+.macro crypt0 m0 m1
+	crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro crypt1 m0 m1
+	crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro crypt2 m0 m1
+	crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt3 m0 m1
+	crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt4 m0 m1
+	crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt5 m0 m1
+	crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt6 m0 m1
+	crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro crypt7 m0 m1
+	crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+.macro encrypt_block a i
+	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+	movdqa MSG0, T0
+	movdqa MSG1, T1
+	crypt\i T0, T1
+	movdq\a T0, (\i * 0x20 + 0x00)(DST)
+	movdq\a T1, (\i * 0x20 + 0x10)(DST)
+
+	update\i
+
+	sub $0x20, LEN
+	cmp $0x20, LEN
+	jl .Lenc_out_\i
+.endm
+
+.macro decrypt_block a i
+	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+	crypt\i MSG0, MSG1
+	movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
+	movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
+
+	update\i
+
+	sub $0x20, LEN
+	cmp $0x20, LEN
+	jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
+ *                                 const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_enc)
+	FRAME_BEGIN
+
+	cmp $0x20, LEN
+	jb .Lenc_out
+
+	state_load
+
+	mov  SRC, %r8
+	or   DST, %r8
+	and $0xf, %r8
+	jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+	encrypt_block a 0
+	encrypt_block a 1
+	encrypt_block a 2
+	encrypt_block a 3
+	encrypt_block a 4
+	encrypt_block a 5
+	encrypt_block a 6
+	encrypt_block a 7
+
+	add $0x100, SRC
+	add $0x100, DST
+	jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+	encrypt_block u 0
+	encrypt_block u 1
+	encrypt_block u 2
+	encrypt_block u 3
+	encrypt_block u 4
+	encrypt_block u 5
+	encrypt_block u 6
+	encrypt_block u 7
+
+	add $0x100, SRC
+	add $0x100, DST
+	jmp .Lenc_u_loop
+
+.Lenc_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Lenc_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Lenc_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Lenc_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Lenc_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Lenc_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Lenc_out_6:
+	state_store6
+	FRAME_END
+	ret
+
+.Lenc_out_7:
+	state_store7
+	FRAME_END
+	ret
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_enc)
+
+/*
+ * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
+ *                                      const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_enc_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG0, T0
+	movdqa MSG1, T1
+	crypt0 T0, T1
+
+	call __store_partial
+
+	update0
+
+	state_store0
+
+	FRAME_END
+ENDPROC(crypto_aegis128l_aesni_enc_tail)
+
+/*
+ * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
+ *                                 const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_dec)
+	FRAME_BEGIN
+
+	cmp $0x20, LEN
+	jb .Ldec_out
+
+	state_load
+
+	mov  SRC, %r8
+	or   DST, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+	decrypt_block a 0
+	decrypt_block a 1
+	decrypt_block a 2
+	decrypt_block a 3
+	decrypt_block a 4
+	decrypt_block a 5
+	decrypt_block a 6
+	decrypt_block a 7
+
+	add $0x100, SRC
+	add $0x100, DST
+	jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+	decrypt_block u 0
+	decrypt_block u 1
+	decrypt_block u 2
+	decrypt_block u 3
+	decrypt_block u 4
+	decrypt_block u 5
+	decrypt_block u 6
+	decrypt_block u 7
+
+	add $0x100, SRC
+	add $0x100, DST
+	jmp .Ldec_u_loop
+
+.Ldec_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Ldec_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Ldec_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Ldec_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Ldec_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Ldec_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Ldec_out_6:
+	state_store6
+	FRAME_END
+	ret
+
+.Ldec_out_7:
+	state_store7
+	FRAME_END
+	ret
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_dec)
+
+/*
+ * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
+ *                                      const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_dec_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* decrypt message: */
+	call __load_partial
+
+	crypt0 MSG0, MSG1
+
+	movdqa MSG0, T0
+	movdqa MSG1, T1
+	call __store_partial
+
+	/* mask with byte count: */
+	movq LEN, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	movdqa T0, T1
+	movdqa .Laegis128l_counter0, T2
+	movdqa .Laegis128l_counter1, T3
+	pcmpgtb T2, T0
+	pcmpgtb T3, T1
+	pand T0, MSG0
+	pand T1, MSG1
+
+	update0
+
+	state_store0
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
+ *                                   u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis128l_aesni_final)
+	FRAME_BEGIN
+
+	state_load
+
+	/* prepare length block: */
+	movq %rdx, MSG0
+	movq %rcx, T0
+	pslldq $8, T0
+	pxor T0, MSG0
+	psllq $3, MSG0 /* multiply by 8 (to get bit count) */
+
+	pxor STATE2, MSG0
+	movdqa MSG0, MSG1
+
+	/* update state: */
+	update0
+	update1
+	update2
+	update3
+	update4
+	update5
+	update6
+
+	/* xor tag: */
+	movdqu (%rsi), T0
+
+	pxor STATE1, T0
+	pxor STATE2, T0
+	pxor STATE3, T0
+	pxor STATE4, T0
+	pxor STATE5, T0
+	pxor STATE6, T0
+	pxor STATE7, T0
+
+	movdqu T0, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis128l_aesni_final)
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
new file mode 100644
index 000000000000..876e4866e633
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-128L Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128L_BLOCK_ALIGN 16
+#define AEGIS128L_BLOCK_SIZE 32
+#define AEGIS128L_NONCE_SIZE 16
+#define AEGIS128L_STATE_BLOCKS 8
+#define AEGIS128L_KEY_SIZE 16
+#define AEGIS128L_MIN_AUTH_SIZE 8
+#define AEGIS128L_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128l_aesni_ad(
+		void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128l_aesni_enc(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_enc_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_final(
+		void *state, void *tag_xor, unsigned int cryptlen,
+		unsigned int assoclen);
+
+struct aegis_block {
+	u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+	struct aegis_block blocks[AEGIS128L_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+	struct aegis_block key;
+};
+
+struct aegis_crypt_ops {
+	int (*skcipher_walk_init)(struct skcipher_walk *walk,
+				  struct aead_request *req, bool atomic);
+
+	void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+			     void *dst);
+	void (*crypt_tail)(void *state, unsigned int length, const void *src,
+			   void *dst);
+};
+
+static void crypto_aegis128l_aesni_process_ad(
+		struct aegis_state *state, struct scatterlist *sg_src,
+		unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct aegis_block buf;
+	unsigned int pos = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int left = size;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *src = (const u8 *)mapped;
+
+		if (pos + size >= AEGIS128L_BLOCK_SIZE) {
+			if (pos > 0) {
+				unsigned int fill = AEGIS128L_BLOCK_SIZE - pos;
+				memcpy(buf.bytes + pos, src, fill);
+				crypto_aegis128l_aesni_ad(state,
+							  AEGIS128L_BLOCK_SIZE,
+							  buf.bytes);
+				pos = 0;
+				left -= fill;
+				src += fill;
+			}
+
+			crypto_aegis128l_aesni_ad(state, left, src);
+
+			src += left & ~(AEGIS128L_BLOCK_SIZE - 1);
+			left &= AEGIS128L_BLOCK_SIZE - 1;
+		}
+
+		memcpy(buf.bytes + pos, src, left);
+		pos += left;
+		assoclen -= size;
+
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, size);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (pos > 0) {
+		memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos);
+		crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes);
+	}
+}
+
+static void crypto_aegis128l_aesni_process_crypt(
+		struct aegis_state *state, struct aead_request *req,
+		const struct aegis_crypt_ops *ops)
+{
+	struct skcipher_walk walk;
+	u8 *src, *dst;
+	unsigned int chunksize, base;
+
+	ops->skcipher_walk_init(&walk, req, false);
+
+	while (walk.nbytes) {
+		src = walk.src.virt.addr;
+		dst = walk.dst.virt.addr;
+		chunksize = walk.nbytes;
+
+		ops->crypt_blocks(state, chunksize, src, dst);
+
+		base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1);
+		src += base;
+		dst += base;
+		chunksize &= AEGIS128L_BLOCK_SIZE - 1;
+
+		if (chunksize > 0)
+			ops->crypt_tail(state, chunksize, src, dst);
+
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *ctx = crypto_aead_ctx(aead);
+	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+	return (void *)ctx;
+}
+
+static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead,
+					 const u8 *key, unsigned int keylen)
+{
+	struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead);
+
+	if (keylen != AEGIS128L_KEY_SIZE) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE);
+
+	return 0;
+}
+
+static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm,
+					      unsigned int authsize)
+{
+	if (authsize > AEGIS128L_MAX_AUTH_SIZE)
+		return -EINVAL;
+	if (authsize < AEGIS128L_MIN_AUTH_SIZE)
+		return -EINVAL;
+	return 0;
+}
+
+static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
+					 struct aegis_block *tag_xor,
+					 unsigned int cryptlen,
+					 const struct aegis_crypt_ops *ops)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
+	struct aegis_state state;
+
+	kernel_fpu_begin();
+
+	crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
+	crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
+	crypto_aegis128l_aesni_process_crypt(&state, req, ops);
+	crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+static int crypto_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = crypto_aegis128l_aesni_enc,
+		.crypt_tail = crypto_aegis128l_aesni_enc_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag = {};
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, authsize, 1);
+	return 0;
+}
+
+static int crypto_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+	static const struct aegis_block zeros = {};
+
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = crypto_aegis128l_aesni_dec,
+		.crypt_tail = crypto_aegis128l_aesni_dec_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag;
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen - authsize;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, authsize, 0);
+
+	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+	return 0;
+}
+
+static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead,
+					 const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead,
+					      unsigned int authsize)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+
+static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis128l_aesni_alg[] = {
+	{
+		.setkey = crypto_aegis128l_aesni_setkey,
+		.setauthsize = crypto_aegis128l_aesni_setauthsize,
+		.encrypt = crypto_aegis128l_aesni_encrypt,
+		.decrypt = crypto_aegis128l_aesni_decrypt,
+		.init = crypto_aegis128l_aesni_init_tfm,
+		.exit = crypto_aegis128l_aesni_exit_tfm,
+
+		.ivsize = AEGIS128L_NONCE_SIZE,
+		.maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+		.chunksize = AEGIS128L_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_INTERNAL,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct aegis_ctx) +
+				__alignof__(struct aegis_ctx),
+			.cra_alignmask = 0,
+
+			.cra_name = "__aegis128l",
+			.cra_driver_name = "__aegis128l-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}, {
+		.setkey = cryptd_aegis128l_aesni_setkey,
+		.setauthsize = cryptd_aegis128l_aesni_setauthsize,
+		.encrypt = cryptd_aegis128l_aesni_encrypt,
+		.decrypt = cryptd_aegis128l_aesni_decrypt,
+		.init = cryptd_aegis128l_aesni_init_tfm,
+		.exit = cryptd_aegis128l_aesni_exit_tfm,
+
+		.ivsize = AEGIS128L_NONCE_SIZE,
+		.maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+		.chunksize = AEGIS128L_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_ASYNC,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct cryptd_aead *),
+			.cra_alignmask = 0,
+
+			.cra_priority = 400,
+
+			.cra_name = "aegis128l",
+			.cra_driver_name = "aegis128l-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}
+};
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_AES),
+	X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+static int __init crypto_aegis128l_aesni_module_init(void)
+{
+	if (!x86_match_cpu(aesni_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis128l_aesni_alg,
+				     ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+static void __exit crypto_aegis128l_aesni_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_aegis128l_aesni_alg,
+				ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+module_init(crypto_aegis128l_aesni_module_init);
+module_exit(crypto_aegis128l_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128l");
+MODULE_ALIAS_CRYPTO("aegis128l-aesni");
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
new file mode 100644
index 000000000000..1d977d515bf9
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -0,0 +1,702 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0	%xmm0
+#define STATE1	%xmm1
+#define STATE2	%xmm2
+#define STATE3	%xmm3
+#define STATE4	%xmm4
+#define STATE5	%xmm5
+#define MSG	%xmm6
+#define T0	%xmm7
+#define T1	%xmm8
+#define T2	%xmm9
+#define T3	%xmm10
+
+#define STATEP	%rdi
+#define LEN	%rsi
+#define SRC	%rdx
+#define DST	%rcx
+
+.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
+.align 16
+.Laegis256_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis256_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
+.align 16
+.Laegis256_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   SRC - src
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG, MSG
+
+	mov LEN, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov LEN, %r8
+	and $0x1E, %r8
+	add SRC, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov LEN, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov LEN, %r8
+	and $0x1C, %r8
+	add SRC, %r8
+	shl $0x10, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov LEN, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov LEN, %r8
+	and $0x18, %r8
+	add SRC, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG
+
+	mov LEN, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov LEN, %r8
+	and $0x10, %r8
+	add SRC, %r8
+	pslldq $8, MSG
+	movq (%r8), T0
+	pxor T0, MSG
+
+.Lld_partial_8:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   LEN - bytes
+ *   DST - dst
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov LEN, %r8
+	mov DST, %r9
+
+	movq T0, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0
+	movq T0, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $0x10, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+.macro update
+	movdqa STATE5, T0
+	aesenc STATE0, STATE5
+	aesenc STATE1, STATE0
+	aesenc STATE2, STATE1
+	aesenc STATE3, STATE2
+	aesenc STATE4, STATE3
+	aesenc T0,     STATE4
+.endm
+
+.macro update0 m
+	update
+	pxor \m, STATE5
+.endm
+
+.macro update1 m
+	update
+	pxor \m, STATE4
+.endm
+
+.macro update2 m
+	update
+	pxor \m, STATE3
+.endm
+
+.macro update3 m
+	update
+	pxor \m, STATE2
+.endm
+
+.macro update4 m
+	update
+	pxor \m, STATE1
+.endm
+
+.macro update5 m
+	update
+	pxor \m, STATE0
+.endm
+
+.macro state_load
+	movdqu 0x00(STATEP), STATE0
+	movdqu 0x10(STATEP), STATE1
+	movdqu 0x20(STATEP), STATE2
+	movdqu 0x30(STATEP), STATE3
+	movdqu 0x40(STATEP), STATE4
+	movdqu 0x50(STATEP), STATE5
+.endm
+
+.macro state_store s0 s1 s2 s3 s4 s5
+	movdqu \s5, 0x00(STATEP)
+	movdqu \s0, 0x10(STATEP)
+	movdqu \s1, 0x20(STATEP)
+	movdqu \s2, 0x30(STATEP)
+	movdqu \s3, 0x40(STATEP)
+	movdqu \s4, 0x50(STATEP)
+.endm
+
+.macro state_store0
+	state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store1
+	state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store2
+	state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store3
+	state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store4
+	state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro state_store5
+	state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis256_aesni_init)
+	FRAME_BEGIN
+
+	/* load key: */
+	movdqa 0x00(%rsi), MSG
+	movdqa 0x10(%rsi), T1
+	movdqa MSG, STATE4
+	movdqa T1, STATE5
+
+	/* load IV: */
+	movdqu 0x00(%rdx), T2
+	movdqu 0x10(%rdx), T3
+	pxor MSG, T2
+	pxor T1, T3
+	movdqa T2, STATE0
+	movdqa T3, STATE1
+
+	/* load the constants: */
+	movdqa .Laegis256_const_0, STATE3
+	movdqa .Laegis256_const_1, STATE2
+	pxor STATE3, STATE4
+	pxor STATE2, STATE5
+
+	/* update 10 times with IV and KEY: */
+	update0 MSG
+	update1 T1
+	update2 T2
+	update3 T3
+	update4 MSG
+	update5 T1
+	update0 T2
+	update1 T3
+	update2 MSG
+	update3 T1
+	update4 T2
+	update5 T3
+	update0 MSG
+	update1 T1
+	update2 T2
+	update3 T3
+
+	state_store3
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_init)
+
+.macro ad_block a i
+	movdq\a (\i * 0x10)(SRC), MSG
+	update\i MSG
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
+ *                               const void *data);
+ */
+ENTRY(crypto_aegis256_aesni_ad)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Lad_out
+
+	state_load
+
+	mov  SRC, %r8
+	and $0xf, %r8
+	jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+	ad_block a 0
+	ad_block a 1
+	ad_block a 2
+	ad_block a 3
+	ad_block a 4
+	ad_block a 5
+
+	add $0x60, SRC
+	jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+	ad_block u 0
+	ad_block u 1
+	ad_block u 2
+	ad_block u 3
+	ad_block u 4
+	ad_block u 5
+
+	add $0x60, SRC
+	jmp .Lad_u_loop
+
+.Lad_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Lad_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Lad_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Lad_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Lad_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Lad_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_ad)
+
+.macro crypt m s0 s1 s2 s3 s4 s5
+	pxor \s1, \m
+	pxor \s4, \m
+	pxor \s5, \m
+	movdqa \s2, T3
+	pand \s3, T3
+	pxor T3, \m
+.endm
+
+.macro crypt0 m
+	crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt1 m
+	crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt2 m
+	crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt3 m
+	crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt4 m
+	crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro crypt5 m
+	crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+.macro encrypt_block a i
+	movdq\a (\i * 0x10)(SRC), MSG
+	movdqa MSG, T0
+	crypt\i T0
+	movdq\a T0, (\i * 0x10)(DST)
+
+	update\i MSG
+
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Lenc_out_\i
+.endm
+
+.macro decrypt_block a i
+	movdq\a (\i * 0x10)(SRC), MSG
+	crypt\i MSG
+	movdq\a MSG, (\i * 0x10)(DST)
+
+	update\i MSG
+
+	sub $0x10, LEN
+	cmp $0x10, LEN
+	jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_enc)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Lenc_out
+
+	state_load
+
+	mov  SRC, %r8
+	or   DST, %r8
+	and $0xf, %r8
+	jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+	encrypt_block a 0
+	encrypt_block a 1
+	encrypt_block a 2
+	encrypt_block a 3
+	encrypt_block a 4
+	encrypt_block a 5
+
+	add $0x60, SRC
+	add $0x60, DST
+	jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+	encrypt_block u 0
+	encrypt_block u 1
+	encrypt_block u 2
+	encrypt_block u 3
+	encrypt_block u 4
+	encrypt_block u 5
+
+	add $0x60, SRC
+	add $0x60, DST
+	jmp .Lenc_u_loop
+
+.Lenc_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Lenc_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Lenc_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Lenc_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Lenc_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Lenc_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_enc)
+
+/*
+ * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_enc_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG, T0
+	crypt0 T0
+
+	call __store_partial
+
+	update0 MSG
+
+	state_store0
+
+	FRAME_END
+ENDPROC(crypto_aegis256_aesni_enc_tail)
+
+/*
+ * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
+ *                                const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_dec)
+	FRAME_BEGIN
+
+	cmp $0x10, LEN
+	jb .Ldec_out
+
+	state_load
+
+	mov  SRC, %r8
+	or   DST, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+	decrypt_block a 0
+	decrypt_block a 1
+	decrypt_block a 2
+	decrypt_block a 3
+	decrypt_block a 4
+	decrypt_block a 5
+
+	add $0x60, SRC
+	add $0x60, DST
+	jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+	decrypt_block u 0
+	decrypt_block u 1
+	decrypt_block u 2
+	decrypt_block u 3
+	decrypt_block u 4
+	decrypt_block u 5
+
+	add $0x60, SRC
+	add $0x60, DST
+	jmp .Ldec_u_loop
+
+.Ldec_out_0:
+	state_store0
+	FRAME_END
+	ret
+
+.Ldec_out_1:
+	state_store1
+	FRAME_END
+	ret
+
+.Ldec_out_2:
+	state_store2
+	FRAME_END
+	ret
+
+.Ldec_out_3:
+	state_store3
+	FRAME_END
+	ret
+
+.Ldec_out_4:
+	state_store4
+	FRAME_END
+	ret
+
+.Ldec_out_5:
+	state_store5
+	FRAME_END
+	ret
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_dec)
+
+/*
+ * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
+ *                                     const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_dec_tail)
+	FRAME_BEGIN
+
+	state_load
+
+	/* decrypt message: */
+	call __load_partial
+
+	crypt0 MSG
+
+	movdqa MSG, T0
+	call __store_partial
+
+	/* mask with byte count: */
+	movq LEN, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	movdqa .Laegis256_counter, T1
+	pcmpgtb T1, T0
+	pand T0, MSG
+
+	update0 MSG
+
+	state_store0
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_dec_tail)
+
+/*
+ * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis256_aesni_final)
+	FRAME_BEGIN
+
+	state_load
+
+	/* prepare length block: */
+	movq %rdx, MSG
+	movq %rcx, T0
+	pslldq $8, T0
+	pxor T0, MSG
+	psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+	pxor STATE3, MSG
+
+	/* update state: */
+	update0 MSG
+	update1 MSG
+	update2 MSG
+	update3 MSG
+	update4 MSG
+	update5 MSG
+	update0 MSG
+
+	/* xor tag: */
+	movdqu (%rsi), MSG
+
+	pxor STATE0, MSG
+	pxor STATE1, MSG
+	pxor STATE2, MSG
+	pxor STATE3, MSG
+	pxor STATE4, MSG
+	pxor STATE5, MSG
+
+	movdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_aegis256_aesni_final)
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
new file mode 100644
index 000000000000..2b5dd3af8f4d
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -0,0 +1,407 @@
+/*
+ * The AEGIS-256 Authenticated-Encryption Algorithm
+ *   Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS256_BLOCK_ALIGN 16
+#define AEGIS256_BLOCK_SIZE 16
+#define AEGIS256_NONCE_SIZE 32
+#define AEGIS256_STATE_BLOCKS 6
+#define AEGIS256_KEY_SIZE 32
+#define AEGIS256_MIN_AUTH_SIZE 8
+#define AEGIS256_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis256_aesni_ad(
+		void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis256_aesni_enc(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_enc_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec_tail(
+		void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_final(
+		void *state, void *tag_xor, unsigned int cryptlen,
+		unsigned int assoclen);
+
+struct aegis_block {
+	u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+	struct aegis_block blocks[AEGIS256_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+	struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE];
+};
+
+struct aegis_crypt_ops {
+	int (*skcipher_walk_init)(struct skcipher_walk *walk,
+				  struct aead_request *req, bool atomic);
+
+	void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+			     void *dst);
+	void (*crypt_tail)(void *state, unsigned int length, const void *src,
+			   void *dst);
+};
+
+static void crypto_aegis256_aesni_process_ad(
+		struct aegis_state *state, struct scatterlist *sg_src,
+		unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct aegis_block buf;
+	unsigned int pos = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int left = size;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *src = (const u8 *)mapped;
+
+		if (pos + size >= AEGIS256_BLOCK_SIZE) {
+			if (pos > 0) {
+				unsigned int fill = AEGIS256_BLOCK_SIZE - pos;
+				memcpy(buf.bytes + pos, src, fill);
+				crypto_aegis256_aesni_ad(state,
+							 AEGIS256_BLOCK_SIZE,
+							 buf.bytes);
+				pos = 0;
+				left -= fill;
+				src += fill;
+			}
+
+			crypto_aegis256_aesni_ad(state, left, src);
+
+			src += left & ~(AEGIS256_BLOCK_SIZE - 1);
+			left &= AEGIS256_BLOCK_SIZE - 1;
+		}
+
+		memcpy(buf.bytes + pos, src, left);
+		pos += left;
+		assoclen -= size;
+
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, size);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (pos > 0) {
+		memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos);
+		crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes);
+	}
+}
+
+static void crypto_aegis256_aesni_process_crypt(
+		struct aegis_state *state, struct aead_request *req,
+		const struct aegis_crypt_ops *ops)
+{
+	struct skcipher_walk walk;
+	u8 *src, *dst;
+	unsigned int chunksize, base;
+
+	ops->skcipher_walk_init(&walk, req, false);
+
+	while (walk.nbytes) {
+		src = walk.src.virt.addr;
+		dst = walk.dst.virt.addr;
+		chunksize = walk.nbytes;
+
+		ops->crypt_blocks(state, chunksize, src, dst);
+
+		base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1);
+		src += base;
+		dst += base;
+		chunksize &= AEGIS256_BLOCK_SIZE - 1;
+
+		if (chunksize > 0)
+			ops->crypt_tail(state, chunksize, src, dst);
+
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead)
+{
+	u8 *ctx = crypto_aead_ctx(aead);
+	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+	return (void *)ctx;
+}
+
+static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+					unsigned int keylen)
+{
+	struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead);
+
+	if (keylen != AEGIS256_KEY_SIZE) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->key, key, AEGIS256_KEY_SIZE);
+
+	return 0;
+}
+
+static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm,
+						unsigned int authsize)
+{
+	if (authsize > AEGIS256_MAX_AUTH_SIZE)
+		return -EINVAL;
+	if (authsize < AEGIS256_MIN_AUTH_SIZE)
+		return -EINVAL;
+	return 0;
+}
+
+static void crypto_aegis256_aesni_crypt(struct aead_request *req,
+					struct aegis_block *tag_xor,
+					unsigned int cryptlen,
+					const struct aegis_crypt_ops *ops)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
+	struct aegis_state state;
+
+	kernel_fpu_begin();
+
+	crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
+	crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
+	crypto_aegis256_aesni_process_crypt(&state, req, ops);
+	crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+static int crypto_aegis256_aesni_encrypt(struct aead_request *req)
+{
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = crypto_aegis256_aesni_enc,
+		.crypt_tail = crypto_aegis256_aesni_enc_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag = {};
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, authsize, 1);
+	return 0;
+}
+
+static int crypto_aegis256_aesni_decrypt(struct aead_request *req)
+{
+	static const struct aegis_block zeros = {};
+
+	static const struct aegis_crypt_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = crypto_aegis256_aesni_dec,
+		.crypt_tail = crypto_aegis256_aesni_dec_tail,
+	};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aegis_block tag;
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen - authsize;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, authsize, 0);
+
+	crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+	return 0;
+}
+
+static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead,
+					const u8 *key, unsigned int keylen)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead,
+					     unsigned int authsize)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis256_aesni_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis256_aesni_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+
+static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis256_aesni_alg[] = {
+	{
+		.setkey = crypto_aegis256_aesni_setkey,
+		.setauthsize = crypto_aegis256_aesni_setauthsize,
+		.encrypt = crypto_aegis256_aesni_encrypt,
+		.decrypt = crypto_aegis256_aesni_decrypt,
+		.init = crypto_aegis256_aesni_init_tfm,
+		.exit = crypto_aegis256_aesni_exit_tfm,
+
+		.ivsize = AEGIS256_NONCE_SIZE,
+		.maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+		.chunksize = AEGIS256_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_INTERNAL,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct aegis_ctx) +
+				__alignof__(struct aegis_ctx),
+			.cra_alignmask = 0,
+
+			.cra_name = "__aegis256",
+			.cra_driver_name = "__aegis256-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}, {
+		.setkey = cryptd_aegis256_aesni_setkey,
+		.setauthsize = cryptd_aegis256_aesni_setauthsize,
+		.encrypt = cryptd_aegis256_aesni_encrypt,
+		.decrypt = cryptd_aegis256_aesni_decrypt,
+		.init = cryptd_aegis256_aesni_init_tfm,
+		.exit = cryptd_aegis256_aesni_exit_tfm,
+
+		.ivsize = AEGIS256_NONCE_SIZE,
+		.maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+		.chunksize = AEGIS256_BLOCK_SIZE,
+
+		.base = {
+			.cra_flags = CRYPTO_ALG_ASYNC,
+			.cra_blocksize = 1,
+			.cra_ctxsize = sizeof(struct cryptd_aead *),
+			.cra_alignmask = 0,
+
+			.cra_priority = 400,
+
+			.cra_name = "aegis256",
+			.cra_driver_name = "aegis256-aesni",
+
+			.cra_module = THIS_MODULE,
+		}
+	}
+};
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_AES),
+	X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+static int __init crypto_aegis256_aesni_module_init(void)
+{
+	if (!x86_match_cpu(aesni_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_aegis256_aesni_alg,
+				    ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+static void __exit crypto_aegis256_aesni_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_aegis256_aesni_alg,
+				ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+module_init(crypto_aegis256_aesni_module_init);
+module_exit(crypto_aegis256_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis256");
+MODULE_ALIAS_CRYPTO("aegis256-aesni");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 0420bab19efb..2ddbe3a1868b 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -364,5 +364,5 @@ module_exit(ghash_pclmulqdqni_mod_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
-		   "acclerated by PCLMULQDQ-NI");
+		   "accelerated by PCLMULQDQ-NI");
 MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
new file mode 100644
index 000000000000..37d422e77931
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -0,0 +1,621 @@
+/*
+ * AVX2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0		%ymm0
+#define STATE0_LOW	%xmm0
+#define STATE1		%ymm1
+#define STATE2		%ymm2
+#define STATE3		%ymm3
+#define STATE4		%ymm4
+#define KEY		%ymm5
+#define MSG		%ymm5
+#define MSG_LOW		%xmm5
+#define T0		%ymm6
+#define T0_LOW		%xmm6
+#define T1		%ymm7
+
+.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
+.align 32
+.Lmorus1280_const:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
+.align 32
+.Lmorus1280_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro morus1280_round s0, s1, s2, s3, s4, b, w
+	vpand \s1, \s2, T0
+	vpxor T0, \s0, \s0
+	vpxor \s3, \s0, \s0
+	vpsllq $\b, \s0, T0
+	vpsrlq $(64 - \b), \s0, \s0
+	vpxor T0, \s0, \s0
+	vpermq $\w, \s3, \s3
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update:
+	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+	vpxor MSG, STATE1, STATE1
+	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+	vpxor MSG, STATE2, STATE2
+	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+	vpxor MSG, STATE3, STATE3
+	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2
+	vpxor MSG, STATE4, STATE4
+	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1
+	ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update_zero:
+	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2
+	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1
+	ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	vpxor MSG, MSG, MSG
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG_LOW
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pshufd $MASK2, MSG_LOW, MSG_LOW
+	pinsrq $0, (%r8), MSG_LOW
+
+.Lld_partial_8:
+	mov %rcx, %r8
+	and $0x10, %r8
+	jz .Lld_partial_16
+
+	vpermq $MASK2, MSG, MSG
+	movdqu (%rsi), MSG_LOW
+
+.Lld_partial_16:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	cmp $16, %r8
+	jl .Lst_partial_16
+
+	movdqu T0_LOW, (%r9)
+	vpermq $MASK2, T0, T0
+
+	sub $16, %r8
+	add $16, %r9
+
+.Lst_partial_16:
+	movq T0_LOW, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	pextrq $1, T0_LOW, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_avx2_init(void *state, const void *key,
+ *                                 const void *iv);
+ */
+ENTRY(crypto_morus1280_avx2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	vpxor STATE0, STATE0, STATE0
+	movdqu (%rdx), STATE0_LOW
+	/* load key: */
+	vmovdqu (%rsi), KEY
+	vmovdqa KEY, STATE1
+	/* load all ones: */
+	vpcmpeqd STATE2, STATE2, STATE2
+	/* load all zeros: */
+	vpxor STATE3, STATE3, STATE3
+	/* load the constant: */
+	vmovdqa .Lmorus1280_const, STATE4
+
+	/* update 16 times with zero: */
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+
+	/* xor-in the key again after updates: */
+	vpxor KEY, STATE1, STATE1
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_init)
+
+/*
+ * void crypto_morus1280_avx2_ad(void *state, const void *data,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_ad)
+	FRAME_BEGIN
+
+	cmp $32, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	and $0x1F, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	vmovdqa (%rsi), MSG
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	vmovdqu (%rsi), MSG
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_ad)
+
+/*
+ * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	or  %rdx,  %r8
+	and $0x1F, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	vmovdqa (%rsi), MSG
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+	vmovdqa T0, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	vmovdqu (%rsi), MSG
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+	vmovdqu T0, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_enc)
+
+/*
+ * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+
+	call __store_partial
+
+	call __morus1280_update
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus1280_avx2_enc_tail)
+
+/*
+ * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	or  %rdx,  %r8
+	and $0x1F, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	vmovdqa (%rsi), MSG
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqa MSG, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	vmovdqu (%rsi), MSG
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqu MSG, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_dec)
+
+/*
+ * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* decrypt message: */
+	call __load_partial
+
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqa MSG, T0
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0_LOW
+	vpbroadcastb T0_LOW, T0
+	vmovdqa .Lmorus1280_counter, T1
+	vpcmpgtb T1, T0, T0
+	vpand T0, MSG, MSG
+
+	call __morus1280_update
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_dec_tail)
+
+/*
+ * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_avx2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* xor state[0] into state[4]: */
+	vpxor STATE0, STATE4, STATE4
+
+	/* prepare length block: */
+	vpxor MSG, MSG, MSG
+	vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
+	vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
+	vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
+
+	/* update state: */
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+
+	/* xor tag: */
+	vmovdqu (%rsi), MSG
+
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_final)
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
new file mode 100644
index 000000000000..f111f36d26dc
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ *   Glue for AVX2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
+					   const void *iv);
+asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
+					 unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
+					  void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
+					  void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+					    u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
+
+static const struct x86_cpu_id avx2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_AVX2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
+
+static int __init crypto_morus1280_avx2_module_init(void)
+{
+	if (!x86_match_cpu(avx2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus1280_avx2_algs,
+				     ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+static void __exit crypto_morus1280_avx2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus1280_avx2_algs,
+				ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+module_init(crypto_morus1280_avx2_module_init);
+module_exit(crypto_morus1280_avx2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-avx2");
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
new file mode 100644
index 000000000000..1fe637c7be9d
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -0,0 +1,895 @@
+/*
+ * SSE2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+
+#define STATE0_LO	%xmm0
+#define STATE0_HI	%xmm1
+#define STATE1_LO	%xmm2
+#define STATE1_HI	%xmm3
+#define STATE2_LO	%xmm4
+#define STATE2_HI	%xmm5
+#define STATE3_LO	%xmm6
+#define STATE3_HI	%xmm7
+#define STATE4_LO	%xmm8
+#define STATE4_HI	%xmm9
+#define KEY_LO		%xmm10
+#define KEY_HI		%xmm11
+#define MSG_LO		%xmm10
+#define MSG_HI		%xmm11
+#define T0_LO		%xmm12
+#define T0_HI		%xmm13
+#define T1_LO		%xmm14
+#define T1_HI		%xmm15
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 16
+.align 16
+.Lmorus640_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter_0:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lmorus640_counter_1:
+	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro rol1 hi, lo
+	/*
+	 * HI_1 | HI_0 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | HI_1 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | LO_1 || LO_0 | HI_1
+	 */
+	pshufd $MASK2, \hi, \hi
+	movdqa \hi, T0_LO
+	punpcklqdq \lo, T0_LO
+	punpckhqdq \hi, \lo
+	movdqa \lo, \hi
+	movdqa T0_LO, \lo
+.endm
+
+.macro rol2 hi, lo
+	movdqa \lo, T0_LO
+	movdqa \hi, \lo
+	movdqa T0_LO, \hi
+.endm
+
+.macro rol3 hi, lo
+	/*
+	 * HI_1 | HI_0 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | HI_1 || LO_1 | LO_0
+	 *  ==>
+	 * LO_0 | HI_1 || HI_0 | LO_1
+	 */
+	pshufd $MASK2, \hi, \hi
+	movdqa \lo, T0_LO
+	punpckhqdq \hi, T0_LO
+	punpcklqdq \lo, \hi
+	movdqa T0_LO, \lo
+.endm
+
+.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
+	movdqa \s1_l, T0_LO
+	pand \s2_l, T0_LO
+	pxor T0_LO, \s0_l
+
+	movdqa \s1_h, T0_LO
+	pand \s2_h, T0_LO
+	pxor T0_LO, \s0_h
+
+	pxor \s3_l, \s0_l
+	pxor \s3_h, \s0_h
+
+	movdqa \s0_l, T0_LO
+	psllq $\b, T0_LO
+	psrlq $(64 - \b), \s0_l
+	pxor T0_LO, \s0_l
+
+	movdqa \s0_h, T0_LO
+	psllq $\b, T0_LO
+	psrlq $(64 - \b), \s0_h
+	pxor T0_LO, \s0_h
+
+	\w \s3_h, \s3_l
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update:
+	morus1280_round \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		13, rol1
+	pxor MSG_LO, STATE1_LO
+	pxor MSG_HI, STATE1_HI
+	morus1280_round \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		46, rol2
+	pxor MSG_LO, STATE2_LO
+	pxor MSG_HI, STATE2_HI
+	morus1280_round \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		38, rol3
+	pxor MSG_LO, STATE3_LO
+	pxor MSG_HI, STATE3_HI
+	morus1280_round \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		7, rol2
+	pxor MSG_LO, STATE4_LO
+	pxor MSG_HI, STATE4_HI
+	morus1280_round \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		4, rol1
+	ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update_zero:
+	morus1280_round \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		13, rol1
+	morus1280_round \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		46, rol2
+	morus1280_round \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		38, rol3
+	morus1280_round \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		7, rol2
+	morus1280_round \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		4, rol1
+	ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG_LO, MSG_LO
+	pxor MSG_HI, MSG_HI
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG_LO
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pslldq $8, MSG_LO
+	movq (%r8), T0_LO
+	pxor T0_LO, MSG_LO
+
+.Lld_partial_8:
+	mov %rcx, %r8
+	and $0x10, %r8
+	jz .Lld_partial_16
+
+	movdqa MSG_LO, MSG_HI
+	movdqu (%rsi), MSG_LO
+
+.Lld_partial_16:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	cmp $16, %r8
+	jl .Lst_partial_16
+
+	movdqu T0_LO, (%r9)
+	movdqa T0_HI, T0_LO
+
+	sub $16, %r8
+	add $16, %r9
+
+.Lst_partial_16:
+	movq T0_LO, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0_LO
+	movq T0_LO, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_sse2_init(void *state, const void *key,
+ *                                 const void *iv);
+ */
+ENTRY(crypto_morus1280_sse2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	pxor STATE0_HI, STATE0_HI
+	movdqu (%rdx), STATE0_LO
+	/* load key: */
+	movdqu  0(%rsi), KEY_LO
+	movdqu 16(%rsi), KEY_HI
+	movdqa KEY_LO, STATE1_LO
+	movdqa KEY_HI, STATE1_HI
+	/* load all ones: */
+	pcmpeqd STATE2_LO, STATE2_LO
+	pcmpeqd STATE2_HI, STATE2_HI
+	/* load all zeros: */
+	pxor STATE3_LO, STATE3_LO
+	pxor STATE3_HI, STATE3_HI
+	/* load the constant: */
+	movdqa .Lmorus640_const_0, STATE4_LO
+	movdqa .Lmorus640_const_1, STATE4_HI
+
+	/* update 16 times with zero: */
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+
+	/* xor-in the key again after updates: */
+	pxor KEY_LO, STATE1_LO
+	pxor KEY_HI, STATE1_HI
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_init)
+
+/*
+ * void crypto_morus1280_sse2_ad(void *state, const void *data,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_ad)
+	FRAME_BEGIN
+
+	cmp $32, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	and $0xF, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_ad)
+
+/*
+ * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	movdqa T0_LO,  0(%rdx)
+	movdqa T0_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	movdqu T0_LO,  0(%rdx)
+	movdqu T0_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_enc)
+
+/*
+ * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+
+	call __store_partial
+
+	call __morus1280_update
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus1280_sse2_enc_tail)
+
+/*
+ * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa MSG_LO,  0(%rdx)
+	movdqa MSG_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqu MSG_LO,  0(%rdx)
+	movdqu MSG_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_dec)
+
+/*
+ * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* decrypt message: */
+	call __load_partial
+
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	movdqa T0_LO, T0_HI
+	movdqa .Lmorus640_counter_0, T1_LO
+	movdqa .Lmorus640_counter_1, T1_HI
+	pcmpgtb T1_LO, T0_LO
+	pcmpgtb T1_HI, T0_HI
+	pand T0_LO, MSG_LO
+	pand T0_HI, MSG_HI
+
+	call __morus1280_update
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_dec_tail)
+
+/*
+ * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_sse2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* xor state[0] into state[4]: */
+	pxor STATE0_LO, STATE4_LO
+	pxor STATE0_HI, STATE4_HI
+
+	/* prepare length block: */
+	movq %rdx, MSG_LO
+	movq %rcx, T0_LO
+	pslldq $8, T0_LO
+	pxor T0_LO, MSG_LO
+	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
+	pxor MSG_HI, MSG_HI
+
+	/* update state: */
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+
+	/* xor tag: */
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T0_LO
+	movdqa STATE1_HI, T0_HI
+	rol3 T0_HI, T0_LO
+	pxor T0_LO, MSG_LO
+	pxor T0_HI, MSG_HI
+	movdqa STATE2_LO, T0_LO
+	movdqa STATE2_HI, T0_HI
+	pand STATE3_LO, T0_LO
+	pand STATE3_HI, T0_HI
+	pxor T0_LO, MSG_LO
+	pxor T0_HI, MSG_HI
+
+	movdqu MSG_LO,  0(%rsi)
+	movdqu MSG_HI, 16(%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_final)
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
new file mode 100644
index 000000000000..839270aa713c
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ *   Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
+					   const void *iv);
+asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
+					 unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
+					  void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
+					  void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+					    u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
+
+static const struct x86_cpu_id sse2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+static int __init crypto_morus1280_sse2_module_init(void)
+{
+	if (!x86_match_cpu(sse2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus1280_sse2_algs,
+				     ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+static void __exit crypto_morus1280_sse2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus1280_sse2_algs,
+				ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+module_init(crypto_morus1280_sse2_module_init);
+module_exit(crypto_morus1280_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-sse2");
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
new file mode 100644
index 000000000000..0dccdda1eb3a
--- /dev/null
+++ b/arch/x86/crypto/morus1280_glue.c
@@ -0,0 +1,302 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ *   Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus1280_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+struct morus1280_state {
+	struct morus1280_block s[MORUS_STATE_BLOCKS];
+};
+
+struct morus1280_ops {
+	int (*skcipher_walk_init)(struct skcipher_walk *walk,
+				  struct aead_request *req, bool atomic);
+
+	void (*crypt_blocks)(void *state, const void *src, void *dst,
+			     unsigned int length);
+	void (*crypt_tail)(void *state, const void *src, void *dst,
+			   unsigned int length);
+};
+
+static void crypto_morus1280_glue_process_ad(
+		struct morus1280_state *state,
+		const struct morus1280_glue_ops *ops,
+		struct scatterlist *sg_src, unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct morus1280_block buf;
+	unsigned int pos = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int left = size;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *src = (const u8 *)mapped;
+
+		if (pos + size >= MORUS1280_BLOCK_SIZE) {
+			if (pos > 0) {
+				unsigned int fill = MORUS1280_BLOCK_SIZE - pos;
+				memcpy(buf.bytes + pos, src, fill);
+				ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+				pos = 0;
+				left -= fill;
+				src += fill;
+			}
+
+			ops->ad(state, src, left);
+			src += left & ~(MORUS1280_BLOCK_SIZE - 1);
+			left &= MORUS1280_BLOCK_SIZE - 1;
+		}
+
+		memcpy(buf.bytes + pos, src, left);
+
+		pos += left;
+		assoclen -= size;
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, size);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (pos > 0) {
+		memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos);
+		ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+	}
+}
+
+static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
+						struct morus1280_ops ops,
+						struct aead_request *req)
+{
+	struct skcipher_walk walk;
+	u8 *cursor_src, *cursor_dst;
+	unsigned int chunksize, base;
+
+	ops.skcipher_walk_init(&walk, req, false);
+
+	while (walk.nbytes) {
+		cursor_src = walk.src.virt.addr;
+		cursor_dst = walk.dst.virt.addr;
+		chunksize = walk.nbytes;
+
+		ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
+
+		base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1);
+		cursor_src += base;
+		cursor_dst += base;
+		chunksize &= MORUS1280_BLOCK_SIZE - 1;
+
+		if (chunksize > 0)
+			ops.crypt_tail(state, cursor_src, cursor_dst,
+				       chunksize);
+
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (keylen == MORUS1280_BLOCK_SIZE) {
+		memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE);
+	} else if (keylen == MORUS1280_BLOCK_SIZE / 2) {
+		memcpy(ctx->key.bytes, key, keylen);
+		memcpy(ctx->key.bytes + keylen, key, keylen);
+	} else {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey);
+
+int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm,
+				      unsigned int authsize)
+{
+	return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize);
+
+static void crypto_morus1280_glue_crypt(struct aead_request *req,
+					struct morus1280_ops ops,
+					unsigned int cryptlen,
+					struct morus1280_block *tag_xor)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus1280_state state;
+
+	kernel_fpu_begin();
+
+	ctx->ops->init(&state, &ctx->key, req->iv);
+	crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
+	crypto_morus1280_glue_process_crypt(&state, ops, req);
+	ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+int crypto_morus1280_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus1280_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = ctx->ops->enc,
+		.crypt_tail = ctx->ops->enc_tail,
+	};
+
+	struct morus1280_block tag = {};
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, authsize, 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt);
+
+int crypto_morus1280_glue_decrypt(struct aead_request *req)
+{
+	static const u8 zeros[MORUS1280_BLOCK_SIZE] = {};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus1280_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = ctx->ops->dec,
+		.crypt_tail = ctx->ops->dec_tail,
+	};
+
+	struct morus1280_block tag;
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen - authsize;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, authsize, 0);
+
+	crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
+
+	return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt);
+
+void crypto_morus1280_glue_init_ops(struct crypto_aead *aead,
+				    const struct morus1280_glue_ops *ops)
+{
+	struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+	ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops);
+
+int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey);
+
+int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead,
+				      unsigned int authsize)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize);
+
+int cryptd_morus1280_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt);
+
+int cryptd_morus1280_glue_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt);
+
+int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
+	char internal_name[CRYPTO_MAX_ALG_NAME];
+
+	if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
+			>= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm);
+
+void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
new file mode 100644
index 000000000000..71c72a0a0862
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -0,0 +1,614 @@
+/*
+ * SSE2 implementation of MORUS-640
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0	%xmm0
+#define STATE1	%xmm1
+#define STATE2	%xmm2
+#define STATE3	%xmm3
+#define STATE4	%xmm4
+#define KEY	%xmm5
+#define MSG	%xmm5
+#define T0	%xmm6
+#define T1	%xmm7
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 32
+.align 16
+.Lmorus640_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+.macro morus640_round s0, s1, s2, s3, s4, b, w
+	movdqa \s1, T0
+	pand \s2, T0
+	pxor T0, \s0
+	pxor \s3, \s0
+	movdqa \s0, T0
+	pslld $\b, T0
+	psrld $(32 - \b), \s0
+	pxor T0, \s0
+	pshufd $\w, \s3, \s3
+.endm
+
+/*
+ * __morus640_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus640_update:
+	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
+	pxor MSG, STATE1
+	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+	pxor MSG, STATE2
+	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
+	pxor MSG, STATE3
+	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+	pxor MSG, STATE4
+	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+	ret
+ENDPROC(__morus640_update)
+
+
+/*
+ * __morus640_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus640_update_zero:
+	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
+	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
+	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+	ret
+ENDPROC(__morus640_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG, MSG
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pslldq $8, MSG
+	movq (%r8), T0
+	pxor T0, MSG
+
+.Lld_partial_8:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	movq T0, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0
+	movq T0, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_morus640_sse2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	movdqu (%rdx), STATE0
+	/* load key: */
+	movdqu (%rsi), KEY
+	movdqa KEY, STATE1
+	/* load all ones: */
+	pcmpeqd STATE2, STATE2
+	/* load the constants: */
+	movdqa .Lmorus640_const_0, STATE3
+	movdqa .Lmorus640_const_1, STATE4
+
+	/* update 16 times with zero: */
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+
+	/* xor-in the key again after updates: */
+	pxor KEY, STATE1
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_init)
+
+/*
+ * void crypto_morus640_sse2_ad(void *state, const void *data,
+ *                              unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_ad)
+	FRAME_BEGIN
+
+	cmp $16, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	and $0xF, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	movdqa (%rsi), MSG
+	call __morus640_update
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	movdqu (%rsi), MSG
+	call __morus640_update
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_ad)
+
+/*
+ * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc)
+	FRAME_BEGIN
+
+	cmp $16, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	movdqa (%rsi), MSG
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+	movdqa T0, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	movdqu (%rsi), MSG
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+	movdqu T0, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_enc)
+
+/*
+ * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
+ *                                    unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+
+	call __store_partial
+
+	call __morus640_update
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus640_sse2_enc_tail)
+
+/*
+ * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec)
+	FRAME_BEGIN
+
+	cmp $16, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	movdqa (%rsi), MSG
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqa MSG, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	movdqu (%rsi), MSG
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqu MSG, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_dec)
+
+/*
+ * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
+ *                                    unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* decrypt message: */
+	call __load_partial
+
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqa MSG, T0
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	movdqa .Lmorus640_counter, T1
+	pcmpgtb T1, T0
+	pand T0, MSG
+
+	call __morus640_update
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_dec_tail)
+
+/*
+ * void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ *	                           u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus640_sse2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* xor state[0] into state[4]: */
+	pxor STATE0, STATE4
+
+	/* prepare length block: */
+	movq %rdx, MSG
+	movq %rcx, T0
+	pslldq $8, T0
+	pxor T0, MSG
+	psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+	/* update state: */
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+
+	/* xor tag: */
+	movdqu (%rsi), MSG
+
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+
+	movdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_final)
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
new file mode 100644
index 000000000000..26b47e2db8d2
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -0,0 +1,68 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ *   Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus640_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
+					  const void *iv);
+asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
+					unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
+					 void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
+					 void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
+					      void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
+					      void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
+					   u64 assoclen, u64 cryptlen);
+
+MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
+
+static const struct x86_cpu_id sse2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+static int __init crypto_morus640_sse2_module_init(void)
+{
+	if (!x86_match_cpu(sse2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus640_sse2_algs,
+				     ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+static void __exit crypto_morus640_sse2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus640_sse2_algs,
+				ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+module_init(crypto_morus640_sse2_module_init);
+module_exit(crypto_morus640_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus640");
+MODULE_ALIAS_CRYPTO("morus640-sse2");
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
new file mode 100644
index 000000000000..7b58fe4d9bd1
--- /dev/null
+++ b/arch/x86/crypto/morus640_glue.c
@@ -0,0 +1,298 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ *   Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus640_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+struct morus640_state {
+	struct morus640_block s[MORUS_STATE_BLOCKS];
+};
+
+struct morus640_ops {
+	int (*skcipher_walk_init)(struct skcipher_walk *walk,
+				  struct aead_request *req, bool atomic);
+
+	void (*crypt_blocks)(void *state, const void *src, void *dst,
+			     unsigned int length);
+	void (*crypt_tail)(void *state, const void *src, void *dst,
+			   unsigned int length);
+};
+
+static void crypto_morus640_glue_process_ad(
+		struct morus640_state *state,
+		const struct morus640_glue_ops *ops,
+		struct scatterlist *sg_src, unsigned int assoclen)
+{
+	struct scatter_walk walk;
+	struct morus640_block buf;
+	unsigned int pos = 0;
+
+	scatterwalk_start(&walk, sg_src);
+	while (assoclen != 0) {
+		unsigned int size = scatterwalk_clamp(&walk, assoclen);
+		unsigned int left = size;
+		void *mapped = scatterwalk_map(&walk);
+		const u8 *src = (const u8 *)mapped;
+
+		if (pos + size >= MORUS640_BLOCK_SIZE) {
+			if (pos > 0) {
+				unsigned int fill = MORUS640_BLOCK_SIZE - pos;
+				memcpy(buf.bytes + pos, src, fill);
+				ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+				pos = 0;
+				left -= fill;
+				src += fill;
+			}
+
+			ops->ad(state, src, left);
+			src += left & ~(MORUS640_BLOCK_SIZE - 1);
+			left &= MORUS640_BLOCK_SIZE - 1;
+		}
+
+		memcpy(buf.bytes + pos, src, left);
+
+		pos += left;
+		assoclen -= size;
+		scatterwalk_unmap(mapped);
+		scatterwalk_advance(&walk, size);
+		scatterwalk_done(&walk, 0, assoclen);
+	}
+
+	if (pos > 0) {
+		memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos);
+		ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+	}
+}
+
+static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
+					       struct morus640_ops ops,
+					       struct aead_request *req)
+{
+	struct skcipher_walk walk;
+	u8 *cursor_src, *cursor_dst;
+	unsigned int chunksize, base;
+
+	ops.skcipher_walk_init(&walk, req, false);
+
+	while (walk.nbytes) {
+		cursor_src = walk.src.virt.addr;
+		cursor_dst = walk.dst.virt.addr;
+		chunksize = walk.nbytes;
+
+		ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
+
+		base = chunksize & ~(MORUS640_BLOCK_SIZE - 1);
+		cursor_src += base;
+		cursor_dst += base;
+		chunksize &= MORUS640_BLOCK_SIZE - 1;
+
+		if (chunksize > 0)
+			ops.crypt_tail(state, cursor_src, cursor_dst,
+				       chunksize);
+
+		skcipher_walk_done(&walk, 0);
+	}
+}
+
+int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				unsigned int keylen)
+{
+	struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (keylen != MORUS640_BLOCK_SIZE) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey);
+
+int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm,
+				     unsigned int authsize)
+{
+	return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize);
+
+static void crypto_morus640_glue_crypt(struct aead_request *req,
+				       struct morus640_ops ops,
+				       unsigned int cryptlen,
+				       struct morus640_block *tag_xor)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus640_state state;
+
+	kernel_fpu_begin();
+
+	ctx->ops->init(&state, &ctx->key, req->iv);
+	crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
+	crypto_morus640_glue_process_crypt(&state, ops, req);
+	ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
+
+	kernel_fpu_end();
+}
+
+int crypto_morus640_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus640_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_encrypt,
+		.crypt_blocks = ctx->ops->enc,
+		.crypt_tail = ctx->ops->enc_tail,
+	};
+
+	struct morus640_block tag = {};
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen;
+
+	crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
+
+	scatterwalk_map_and_copy(tag.bytes, req->dst,
+				 req->assoclen + cryptlen, authsize, 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt);
+
+int crypto_morus640_glue_decrypt(struct aead_request *req)
+{
+	static const u8 zeros[MORUS640_BLOCK_SIZE] = {};
+
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+	struct morus640_ops OPS = {
+		.skcipher_walk_init = skcipher_walk_aead_decrypt,
+		.crypt_blocks = ctx->ops->dec,
+		.crypt_tail = ctx->ops->dec_tail,
+	};
+
+	struct morus640_block tag;
+	unsigned int authsize = crypto_aead_authsize(tfm);
+	unsigned int cryptlen = req->cryptlen - authsize;
+
+	scatterwalk_map_and_copy(tag.bytes, req->src,
+				 req->assoclen + cryptlen, authsize, 0);
+
+	crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
+
+	return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt);
+
+void crypto_morus640_glue_init_ops(struct crypto_aead *aead,
+				   const struct morus640_glue_ops *ops)
+{
+	struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+	ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops);
+
+int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+				unsigned int keylen)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey);
+
+int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead,
+				     unsigned int authsize)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize);
+
+int cryptd_morus640_glue_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt);
+
+int cryptd_morus640_glue_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+
+	aead = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		aead = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, aead);
+
+	return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt);
+
+int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+	const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
+	char internal_name[CRYPTO_MAX_ALG_NAME];
+
+	if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
+			>= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm);
+
+void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644
index 6014b7b9e52a..000000000000
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,938 +0,0 @@
-# Derived from:
-#	salsa20_pm.s version 20051229
-#	D. J. Bernstein
-#	Public domain.
-
-#include <linux/linkage.h>
-
-.text
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-	mov	%esp,%eax
-	and	$31,%eax
-	add	$256,%eax
-	sub	%eax,%esp
-	# eax_stack = eax
-	movl	%eax,80(%esp)
-	# ebx_stack = ebx
-	movl	%ebx,84(%esp)
-	# esi_stack = esi
-	movl	%esi,88(%esp)
-	# edi_stack = edi
-	movl	%edi,92(%esp)
-	# ebp_stack = ebp
-	movl	%ebp,96(%esp)
-	# x = arg1
-	movl	4(%esp,%eax),%edx
-	# m = arg2
-	movl	8(%esp,%eax),%esi
-	# out = arg3
-	movl	12(%esp,%eax),%edi
-	# bytes = arg4
-	movl	16(%esp,%eax),%ebx
-	# bytes -= 0
-	sub	$0,%ebx
-	# goto done if unsigned<=
-	jbe	._done
-._start:
-	# in0 = *(uint32 *) (x + 0)
-	movl	0(%edx),%eax
-	# in1 = *(uint32 *) (x + 4)
-	movl	4(%edx),%ecx
-	# in2 = *(uint32 *) (x + 8)
-	movl	8(%edx),%ebp
-	# j0 = in0
-	movl	%eax,164(%esp)
-	# in3 = *(uint32 *) (x + 12)
-	movl	12(%edx),%eax
-	# j1 = in1
-	movl	%ecx,168(%esp)
-	# in4 = *(uint32 *) (x + 16)
-	movl	16(%edx),%ecx
-	# j2 = in2
-	movl	%ebp,172(%esp)
-	# in5 = *(uint32 *) (x + 20)
-	movl	20(%edx),%ebp
-	# j3 = in3
-	movl	%eax,176(%esp)
-	# in6 = *(uint32 *) (x + 24)
-	movl	24(%edx),%eax
-	# j4 = in4
-	movl	%ecx,180(%esp)
-	# in7 = *(uint32 *) (x + 28)
-	movl	28(%edx),%ecx
-	# j5 = in5
-	movl	%ebp,184(%esp)
-	# in8 = *(uint32 *) (x + 32)
-	movl	32(%edx),%ebp
-	# j6 = in6
-	movl	%eax,188(%esp)
-	# in9 = *(uint32 *) (x + 36)
-	movl	36(%edx),%eax
-	# j7 = in7
-	movl	%ecx,192(%esp)
-	# in10 = *(uint32 *) (x + 40)
-	movl	40(%edx),%ecx
-	# j8 = in8
-	movl	%ebp,196(%esp)
-	# in11 = *(uint32 *) (x + 44)
-	movl	44(%edx),%ebp
-	# j9 = in9
-	movl	%eax,200(%esp)
-	# in12 = *(uint32 *) (x + 48)
-	movl	48(%edx),%eax
-	# j10 = in10
-	movl	%ecx,204(%esp)
-	# in13 = *(uint32 *) (x + 52)
-	movl	52(%edx),%ecx
-	# j11 = in11
-	movl	%ebp,208(%esp)
-	# in14 = *(uint32 *) (x + 56)
-	movl	56(%edx),%ebp
-	# j12 = in12
-	movl	%eax,212(%esp)
-	# in15 = *(uint32 *) (x + 60)
-	movl	60(%edx),%eax
-	# j13 = in13
-	movl	%ecx,216(%esp)
-	# j14 = in14
-	movl	%ebp,220(%esp)
-	# j15 = in15
-	movl	%eax,224(%esp)
-	# x_backup = x
-	movl	%edx,64(%esp)
-._bytesatleast1:
-	#   bytes - 64
-	cmp	$64,%ebx
-	#   goto nocopy if unsigned>=
-	jae	._nocopy
-	#     ctarget = out
-	movl	%edi,228(%esp)
-	#     out = &tmp
-	leal	0(%esp),%edi
-	#     i = bytes
-	mov	%ebx,%ecx
-	#     while (i) { *out++ = *m++; --i }
-	rep	movsb
-	#     out = &tmp
-	leal	0(%esp),%edi
-	#     m = &tmp
-	leal	0(%esp),%esi
-._nocopy:
-	#   out_backup = out
-	movl	%edi,72(%esp)
-	#   m_backup = m
-	movl	%esi,68(%esp)
-	#   bytes_backup = bytes
-	movl	%ebx,76(%esp)
-	#   in0 = j0
-	movl	164(%esp),%eax
-	#   in1 = j1
-	movl	168(%esp),%ecx
-	#   in2 = j2
-	movl	172(%esp),%edx
-	#   in3 = j3
-	movl	176(%esp),%ebx
-	#   x0 = in0
-	movl	%eax,100(%esp)
-	#   x1 = in1
-	movl	%ecx,104(%esp)
-	#   x2 = in2
-	movl	%edx,108(%esp)
-	#   x3 = in3
-	movl	%ebx,112(%esp)
-	#   in4 = j4
-	movl	180(%esp),%eax
-	#   in5 = j5
-	movl	184(%esp),%ecx
-	#   in6 = j6
-	movl	188(%esp),%edx
-	#   in7 = j7
-	movl	192(%esp),%ebx
-	#   x4 = in4
-	movl	%eax,116(%esp)
-	#   x5 = in5
-	movl	%ecx,120(%esp)
-	#   x6 = in6
-	movl	%edx,124(%esp)
-	#   x7 = in7
-	movl	%ebx,128(%esp)
-	#   in8 = j8
-	movl	196(%esp),%eax
-	#   in9 = j9
-	movl	200(%esp),%ecx
-	#   in10 = j10
-	movl	204(%esp),%edx
-	#   in11 = j11
-	movl	208(%esp),%ebx
-	#   x8 = in8
-	movl	%eax,132(%esp)
-	#   x9 = in9
-	movl	%ecx,136(%esp)
-	#   x10 = in10
-	movl	%edx,140(%esp)
-	#   x11 = in11
-	movl	%ebx,144(%esp)
-	#   in12 = j12
-	movl	212(%esp),%eax
-	#   in13 = j13
-	movl	216(%esp),%ecx
-	#   in14 = j14
-	movl	220(%esp),%edx
-	#   in15 = j15
-	movl	224(%esp),%ebx
-	#   x12 = in12
-	movl	%eax,148(%esp)
-	#   x13 = in13
-	movl	%ecx,152(%esp)
-	#   x14 = in14
-	movl	%edx,156(%esp)
-	#   x15 = in15
-	movl	%ebx,160(%esp)
-	#   i = 20
-	mov	$20,%ebp
-	# p = x0
-	movl	100(%esp),%eax
-	# s = x5
-	movl	120(%esp),%ecx
-	# t = x10
-	movl	140(%esp),%edx
-	# w = x15
-	movl	160(%esp),%ebx
-._mainloop:
-	# x0 = p
-	movl	%eax,100(%esp)
-	# 				x10 = t
-	movl	%edx,140(%esp)
-	# p += x12
-	addl	148(%esp),%eax
-	# 		x5 = s
-	movl	%ecx,120(%esp)
-	# 				t += x6
-	addl	124(%esp),%edx
-	# 						x15 = w
-	movl	%ebx,160(%esp)
-	# 		r = x1
-	movl	104(%esp),%esi
-	# 		r += s
-	add	%ecx,%esi
-	# 						v = x11
-	movl	144(%esp),%edi
-	# 						v += w
-	add	%ebx,%edi
-	# p <<<= 7
-	rol	$7,%eax
-	# p ^= x4
-	xorl	116(%esp),%eax
-	# 				t <<<= 7
-	rol	$7,%edx
-	# 				t ^= x14
-	xorl	156(%esp),%edx
-	# 		r <<<= 7
-	rol	$7,%esi
-	# 		r ^= x9
-	xorl	136(%esp),%esi
-	# 						v <<<= 7
-	rol	$7,%edi
-	# 						v ^= x3
-	xorl	112(%esp),%edi
-	# x4 = p
-	movl	%eax,116(%esp)
-	# 				x14 = t
-	movl	%edx,156(%esp)
-	# p += x0
-	addl	100(%esp),%eax
-	# 		x9 = r
-	movl	%esi,136(%esp)
-	# 				t += x10
-	addl	140(%esp),%edx
-	# 						x3 = v
-	movl	%edi,112(%esp)
-	# p <<<= 9
-	rol	$9,%eax
-	# p ^= x8
-	xorl	132(%esp),%eax
-	# 				t <<<= 9
-	rol	$9,%edx
-	# 				t ^= x2
-	xorl	108(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 9
-	rol	$9,%ecx
-	# 		s ^= x13
-	xorl	152(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 9
-	rol	$9,%ebx
-	# 						w ^= x7
-	xorl	128(%esp),%ebx
-	# x8 = p
-	movl	%eax,132(%esp)
-	# 				x2 = t
-	movl	%edx,108(%esp)
-	# p += x4
-	addl	116(%esp),%eax
-	# 		x13 = s
-	movl	%ecx,152(%esp)
-	# 				t += x14
-	addl	156(%esp),%edx
-	# 						x7 = w
-	movl	%ebx,128(%esp)
-	# p <<<= 13
-	rol	$13,%eax
-	# p ^= x12
-	xorl	148(%esp),%eax
-	# 				t <<<= 13
-	rol	$13,%edx
-	# 				t ^= x6
-	xorl	124(%esp),%edx
-	# 		r += s
-	add	%ecx,%esi
-	# 		r <<<= 13
-	rol	$13,%esi
-	# 		r ^= x1
-	xorl	104(%esp),%esi
-	# 						v += w
-	add	%ebx,%edi
-	# 						v <<<= 13
-	rol	$13,%edi
-	# 						v ^= x11
-	xorl	144(%esp),%edi
-	# x12 = p
-	movl	%eax,148(%esp)
-	# 				x6 = t
-	movl	%edx,124(%esp)
-	# p += x8
-	addl	132(%esp),%eax
-	# 		x1 = r
-	movl	%esi,104(%esp)
-	# 				t += x2
-	addl	108(%esp),%edx
-	# 						x11 = v
-	movl	%edi,144(%esp)
-	# p <<<= 18
-	rol	$18,%eax
-	# p ^= x0
-	xorl	100(%esp),%eax
-	# 				t <<<= 18
-	rol	$18,%edx
-	# 				t ^= x10
-	xorl	140(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 18
-	rol	$18,%ecx
-	# 		s ^= x5
-	xorl	120(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 18
-	rol	$18,%ebx
-	# 						w ^= x15
-	xorl	160(%esp),%ebx
-	# x0 = p
-	movl	%eax,100(%esp)
-	# 				x10 = t
-	movl	%edx,140(%esp)
-	# p += x3
-	addl	112(%esp),%eax
-	# p <<<= 7
-	rol	$7,%eax
-	# 		x5 = s
-	movl	%ecx,120(%esp)
-	# 				t += x9
-	addl	136(%esp),%edx
-	# 						x15 = w
-	movl	%ebx,160(%esp)
-	# 		r = x4
-	movl	116(%esp),%esi
-	# 		r += s
-	add	%ecx,%esi
-	# 						v = x14
-	movl	156(%esp),%edi
-	# 						v += w
-	add	%ebx,%edi
-	# p ^= x1
-	xorl	104(%esp),%eax
-	# 				t <<<= 7
-	rol	$7,%edx
-	# 				t ^= x11
-	xorl	144(%esp),%edx
-	# 		r <<<= 7
-	rol	$7,%esi
-	# 		r ^= x6
-	xorl	124(%esp),%esi
-	# 						v <<<= 7
-	rol	$7,%edi
-	# 						v ^= x12
-	xorl	148(%esp),%edi
-	# x1 = p
-	movl	%eax,104(%esp)
-	# 				x11 = t
-	movl	%edx,144(%esp)
-	# p += x0
-	addl	100(%esp),%eax
-	# 		x6 = r
-	movl	%esi,124(%esp)
-	# 				t += x10
-	addl	140(%esp),%edx
-	# 						x12 = v
-	movl	%edi,148(%esp)
-	# p <<<= 9
-	rol	$9,%eax
-	# p ^= x2
-	xorl	108(%esp),%eax
-	# 				t <<<= 9
-	rol	$9,%edx
-	# 				t ^= x8
-	xorl	132(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 9
-	rol	$9,%ecx
-	# 		s ^= x7
-	xorl	128(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 9
-	rol	$9,%ebx
-	# 						w ^= x13
-	xorl	152(%esp),%ebx
-	# x2 = p
-	movl	%eax,108(%esp)
-	# 				x8 = t
-	movl	%edx,132(%esp)
-	# p += x1
-	addl	104(%esp),%eax
-	# 		x7 = s
-	movl	%ecx,128(%esp)
-	# 				t += x11
-	addl	144(%esp),%edx
-	# 						x13 = w
-	movl	%ebx,152(%esp)
-	# p <<<= 13
-	rol	$13,%eax
-	# p ^= x3
-	xorl	112(%esp),%eax
-	# 				t <<<= 13
-	rol	$13,%edx
-	# 				t ^= x9
-	xorl	136(%esp),%edx
-	# 		r += s
-	add	%ecx,%esi
-	# 		r <<<= 13
-	rol	$13,%esi
-	# 		r ^= x4
-	xorl	116(%esp),%esi
-	# 						v += w
-	add	%ebx,%edi
-	# 						v <<<= 13
-	rol	$13,%edi
-	# 						v ^= x14
-	xorl	156(%esp),%edi
-	# x3 = p
-	movl	%eax,112(%esp)
-	# 				x9 = t
-	movl	%edx,136(%esp)
-	# p += x2
-	addl	108(%esp),%eax
-	# 		x4 = r
-	movl	%esi,116(%esp)
-	# 				t += x8
-	addl	132(%esp),%edx
-	# 						x14 = v
-	movl	%edi,156(%esp)
-	# p <<<= 18
-	rol	$18,%eax
-	# p ^= x0
-	xorl	100(%esp),%eax
-	# 				t <<<= 18
-	rol	$18,%edx
-	# 				t ^= x10
-	xorl	140(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 18
-	rol	$18,%ecx
-	# 		s ^= x5
-	xorl	120(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 18
-	rol	$18,%ebx
-	# 						w ^= x15
-	xorl	160(%esp),%ebx
-	# x0 = p
-	movl	%eax,100(%esp)
-	# 				x10 = t
-	movl	%edx,140(%esp)
-	# p += x12
-	addl	148(%esp),%eax
-	# 		x5 = s
-	movl	%ecx,120(%esp)
-	# 				t += x6
-	addl	124(%esp),%edx
-	# 						x15 = w
-	movl	%ebx,160(%esp)
-	# 		r = x1
-	movl	104(%esp),%esi
-	# 		r += s
-	add	%ecx,%esi
-	# 						v = x11
-	movl	144(%esp),%edi
-	# 						v += w
-	add	%ebx,%edi
-	# p <<<= 7
-	rol	$7,%eax
-	# p ^= x4
-	xorl	116(%esp),%eax
-	# 				t <<<= 7
-	rol	$7,%edx
-	# 				t ^= x14
-	xorl	156(%esp),%edx
-	# 		r <<<= 7
-	rol	$7,%esi
-	# 		r ^= x9
-	xorl	136(%esp),%esi
-	# 						v <<<= 7
-	rol	$7,%edi
-	# 						v ^= x3
-	xorl	112(%esp),%edi
-	# x4 = p
-	movl	%eax,116(%esp)
-	# 				x14 = t
-	movl	%edx,156(%esp)
-	# p += x0
-	addl	100(%esp),%eax
-	# 		x9 = r
-	movl	%esi,136(%esp)
-	# 				t += x10
-	addl	140(%esp),%edx
-	# 						x3 = v
-	movl	%edi,112(%esp)
-	# p <<<= 9
-	rol	$9,%eax
-	# p ^= x8
-	xorl	132(%esp),%eax
-	# 				t <<<= 9
-	rol	$9,%edx
-	# 				t ^= x2
-	xorl	108(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 9
-	rol	$9,%ecx
-	# 		s ^= x13
-	xorl	152(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 9
-	rol	$9,%ebx
-	# 						w ^= x7
-	xorl	128(%esp),%ebx
-	# x8 = p
-	movl	%eax,132(%esp)
-	# 				x2 = t
-	movl	%edx,108(%esp)
-	# p += x4
-	addl	116(%esp),%eax
-	# 		x13 = s
-	movl	%ecx,152(%esp)
-	# 				t += x14
-	addl	156(%esp),%edx
-	# 						x7 = w
-	movl	%ebx,128(%esp)
-	# p <<<= 13
-	rol	$13,%eax
-	# p ^= x12
-	xorl	148(%esp),%eax
-	# 				t <<<= 13
-	rol	$13,%edx
-	# 				t ^= x6
-	xorl	124(%esp),%edx
-	# 		r += s
-	add	%ecx,%esi
-	# 		r <<<= 13
-	rol	$13,%esi
-	# 		r ^= x1
-	xorl	104(%esp),%esi
-	# 						v += w
-	add	%ebx,%edi
-	# 						v <<<= 13
-	rol	$13,%edi
-	# 						v ^= x11
-	xorl	144(%esp),%edi
-	# x12 = p
-	movl	%eax,148(%esp)
-	# 				x6 = t
-	movl	%edx,124(%esp)
-	# p += x8
-	addl	132(%esp),%eax
-	# 		x1 = r
-	movl	%esi,104(%esp)
-	# 				t += x2
-	addl	108(%esp),%edx
-	# 						x11 = v
-	movl	%edi,144(%esp)
-	# p <<<= 18
-	rol	$18,%eax
-	# p ^= x0
-	xorl	100(%esp),%eax
-	# 				t <<<= 18
-	rol	$18,%edx
-	# 				t ^= x10
-	xorl	140(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 18
-	rol	$18,%ecx
-	# 		s ^= x5
-	xorl	120(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 18
-	rol	$18,%ebx
-	# 						w ^= x15
-	xorl	160(%esp),%ebx
-	# x0 = p
-	movl	%eax,100(%esp)
-	# 				x10 = t
-	movl	%edx,140(%esp)
-	# p += x3
-	addl	112(%esp),%eax
-	# p <<<= 7
-	rol	$7,%eax
-	# 		x5 = s
-	movl	%ecx,120(%esp)
-	# 				t += x9
-	addl	136(%esp),%edx
-	# 						x15 = w
-	movl	%ebx,160(%esp)
-	# 		r = x4
-	movl	116(%esp),%esi
-	# 		r += s
-	add	%ecx,%esi
-	# 						v = x14
-	movl	156(%esp),%edi
-	# 						v += w
-	add	%ebx,%edi
-	# p ^= x1
-	xorl	104(%esp),%eax
-	# 				t <<<= 7
-	rol	$7,%edx
-	# 				t ^= x11
-	xorl	144(%esp),%edx
-	# 		r <<<= 7
-	rol	$7,%esi
-	# 		r ^= x6
-	xorl	124(%esp),%esi
-	# 						v <<<= 7
-	rol	$7,%edi
-	# 						v ^= x12
-	xorl	148(%esp),%edi
-	# x1 = p
-	movl	%eax,104(%esp)
-	# 				x11 = t
-	movl	%edx,144(%esp)
-	# p += x0
-	addl	100(%esp),%eax
-	# 		x6 = r
-	movl	%esi,124(%esp)
-	# 				t += x10
-	addl	140(%esp),%edx
-	# 						x12 = v
-	movl	%edi,148(%esp)
-	# p <<<= 9
-	rol	$9,%eax
-	# p ^= x2
-	xorl	108(%esp),%eax
-	# 				t <<<= 9
-	rol	$9,%edx
-	# 				t ^= x8
-	xorl	132(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 9
-	rol	$9,%ecx
-	# 		s ^= x7
-	xorl	128(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 9
-	rol	$9,%ebx
-	# 						w ^= x13
-	xorl	152(%esp),%ebx
-	# x2 = p
-	movl	%eax,108(%esp)
-	# 				x8 = t
-	movl	%edx,132(%esp)
-	# p += x1
-	addl	104(%esp),%eax
-	# 		x7 = s
-	movl	%ecx,128(%esp)
-	# 				t += x11
-	addl	144(%esp),%edx
-	# 						x13 = w
-	movl	%ebx,152(%esp)
-	# p <<<= 13
-	rol	$13,%eax
-	# p ^= x3
-	xorl	112(%esp),%eax
-	# 				t <<<= 13
-	rol	$13,%edx
-	# 				t ^= x9
-	xorl	136(%esp),%edx
-	# 		r += s
-	add	%ecx,%esi
-	# 		r <<<= 13
-	rol	$13,%esi
-	# 		r ^= x4
-	xorl	116(%esp),%esi
-	# 						v += w
-	add	%ebx,%edi
-	# 						v <<<= 13
-	rol	$13,%edi
-	# 						v ^= x14
-	xorl	156(%esp),%edi
-	# x3 = p
-	movl	%eax,112(%esp)
-	# 				x9 = t
-	movl	%edx,136(%esp)
-	# p += x2
-	addl	108(%esp),%eax
-	# 		x4 = r
-	movl	%esi,116(%esp)
-	# 				t += x8
-	addl	132(%esp),%edx
-	# 						x14 = v
-	movl	%edi,156(%esp)
-	# p <<<= 18
-	rol	$18,%eax
-	# p ^= x0
-	xorl	100(%esp),%eax
-	# 				t <<<= 18
-	rol	$18,%edx
-	# 				t ^= x10
-	xorl	140(%esp),%edx
-	# 		s += r
-	add	%esi,%ecx
-	# 		s <<<= 18
-	rol	$18,%ecx
-	# 		s ^= x5
-	xorl	120(%esp),%ecx
-	# 						w += v
-	add	%edi,%ebx
-	# 						w <<<= 18
-	rol	$18,%ebx
-	# 						w ^= x15
-	xorl	160(%esp),%ebx
-	# i -= 4
-	sub	$4,%ebp
-	# goto mainloop if unsigned >
-	ja	._mainloop
-	# x0 = p
-	movl	%eax,100(%esp)
-	# x5 = s
-	movl	%ecx,120(%esp)
-	# x10 = t
-	movl	%edx,140(%esp)
-	# x15 = w
-	movl	%ebx,160(%esp)
-	#   out = out_backup
-	movl	72(%esp),%edi
-	#   m = m_backup
-	movl	68(%esp),%esi
-	#   in0 = x0
-	movl	100(%esp),%eax
-	#   in1 = x1
-	movl	104(%esp),%ecx
-	#   in0 += j0
-	addl	164(%esp),%eax
-	#   in1 += j1
-	addl	168(%esp),%ecx
-	#   in0 ^= *(uint32 *) (m + 0)
-	xorl	0(%esi),%eax
-	#   in1 ^= *(uint32 *) (m + 4)
-	xorl	4(%esi),%ecx
-	#   *(uint32 *) (out + 0) = in0
-	movl	%eax,0(%edi)
-	#   *(uint32 *) (out + 4) = in1
-	movl	%ecx,4(%edi)
-	#   in2 = x2
-	movl	108(%esp),%eax
-	#   in3 = x3
-	movl	112(%esp),%ecx
-	#   in2 += j2
-	addl	172(%esp),%eax
-	#   in3 += j3
-	addl	176(%esp),%ecx
-	#   in2 ^= *(uint32 *) (m + 8)
-	xorl	8(%esi),%eax
-	#   in3 ^= *(uint32 *) (m + 12)
-	xorl	12(%esi),%ecx
-	#   *(uint32 *) (out + 8) = in2
-	movl	%eax,8(%edi)
-	#   *(uint32 *) (out + 12) = in3
-	movl	%ecx,12(%edi)
-	#   in4 = x4
-	movl	116(%esp),%eax
-	#   in5 = x5
-	movl	120(%esp),%ecx
-	#   in4 += j4
-	addl	180(%esp),%eax
-	#   in5 += j5
-	addl	184(%esp),%ecx
-	#   in4 ^= *(uint32 *) (m + 16)
-	xorl	16(%esi),%eax
-	#   in5 ^= *(uint32 *) (m + 20)
-	xorl	20(%esi),%ecx
-	#   *(uint32 *) (out + 16) = in4
-	movl	%eax,16(%edi)
-	#   *(uint32 *) (out + 20) = in5
-	movl	%ecx,20(%edi)
-	#   in6 = x6
-	movl	124(%esp),%eax
-	#   in7 = x7
-	movl	128(%esp),%ecx
-	#   in6 += j6
-	addl	188(%esp),%eax
-	#   in7 += j7
-	addl	192(%esp),%ecx
-	#   in6 ^= *(uint32 *) (m + 24)
-	xorl	24(%esi),%eax
-	#   in7 ^= *(uint32 *) (m + 28)
-	xorl	28(%esi),%ecx
-	#   *(uint32 *) (out + 24) = in6
-	movl	%eax,24(%edi)
-	#   *(uint32 *) (out + 28) = in7
-	movl	%ecx,28(%edi)
-	#   in8 = x8
-	movl	132(%esp),%eax
-	#   in9 = x9
-	movl	136(%esp),%ecx
-	#   in8 += j8
-	addl	196(%esp),%eax
-	#   in9 += j9
-	addl	200(%esp),%ecx
-	#   in8 ^= *(uint32 *) (m + 32)
-	xorl	32(%esi),%eax
-	#   in9 ^= *(uint32 *) (m + 36)
-	xorl	36(%esi),%ecx
-	#   *(uint32 *) (out + 32) = in8
-	movl	%eax,32(%edi)
-	#   *(uint32 *) (out + 36) = in9
-	movl	%ecx,36(%edi)
-	#   in10 = x10
-	movl	140(%esp),%eax
-	#   in11 = x11
-	movl	144(%esp),%ecx
-	#   in10 += j10
-	addl	204(%esp),%eax
-	#   in11 += j11
-	addl	208(%esp),%ecx
-	#   in10 ^= *(uint32 *) (m + 40)
-	xorl	40(%esi),%eax
-	#   in11 ^= *(uint32 *) (m + 44)
-	xorl	44(%esi),%ecx
-	#   *(uint32 *) (out + 40) = in10
-	movl	%eax,40(%edi)
-	#   *(uint32 *) (out + 44) = in11
-	movl	%ecx,44(%edi)
-	#   in12 = x12
-	movl	148(%esp),%eax
-	#   in13 = x13
-	movl	152(%esp),%ecx
-	#   in12 += j12
-	addl	212(%esp),%eax
-	#   in13 += j13
-	addl	216(%esp),%ecx
-	#   in12 ^= *(uint32 *) (m + 48)
-	xorl	48(%esi),%eax
-	#   in13 ^= *(uint32 *) (m + 52)
-	xorl	52(%esi),%ecx
-	#   *(uint32 *) (out + 48) = in12
-	movl	%eax,48(%edi)
-	#   *(uint32 *) (out + 52) = in13
-	movl	%ecx,52(%edi)
-	#   in14 = x14
-	movl	156(%esp),%eax
-	#   in15 = x15
-	movl	160(%esp),%ecx
-	#   in14 += j14
-	addl	220(%esp),%eax
-	#   in15 += j15
-	addl	224(%esp),%ecx
-	#   in14 ^= *(uint32 *) (m + 56)
-	xorl	56(%esi),%eax
-	#   in15 ^= *(uint32 *) (m + 60)
-	xorl	60(%esi),%ecx
-	#   *(uint32 *) (out + 56) = in14
-	movl	%eax,56(%edi)
-	#   *(uint32 *) (out + 60) = in15
-	movl	%ecx,60(%edi)
-	#   bytes = bytes_backup
-	movl	76(%esp),%ebx
-	#   in8 = j8
-	movl	196(%esp),%eax
-	#   in9 = j9
-	movl	200(%esp),%ecx
-	#   in8 += 1
-	add	$1,%eax
-	#   in9 += 0 + carry
-	adc	$0,%ecx
-	#   j8 = in8
-	movl	%eax,196(%esp)
-	#   j9 = in9
-	movl	%ecx,200(%esp)
-	#   bytes - 64
-	cmp	$64,%ebx
-	#   goto bytesatleast65 if unsigned>
-	ja	._bytesatleast65
-	#     goto bytesatleast64 if unsigned>=
-	jae	._bytesatleast64
-	#       m = out
-	mov	%edi,%esi
-	#       out = ctarget
-	movl	228(%esp),%edi
-	#       i = bytes
-	mov	%ebx,%ecx
-	#       while (i) { *out++ = *m++; --i }
-	rep	movsb
-._bytesatleast64:
-	#     x = x_backup
-	movl	64(%esp),%eax
-	#     in8 = j8
-	movl	196(%esp),%ecx
-	#     in9 = j9
-	movl	200(%esp),%edx
-	#     *(uint32 *) (x + 32) = in8
-	movl	%ecx,32(%eax)
-	#     *(uint32 *) (x + 36) = in9
-	movl	%edx,36(%eax)
-._done:
-	#     eax = eax_stack
-	movl	80(%esp),%eax
-	#     ebx = ebx_stack
-	movl	84(%esp),%ebx
-	#     esi = esi_stack
-	movl	88(%esp),%esi
-	#     edi = edi_stack
-	movl	92(%esp),%edi
-	#     ebp = ebp_stack
-	movl	96(%esp),%ebp
-	#     leave
-	add	%eax,%esp
-	ret
-._bytesatleast65:
-	#   bytes -= 64
-	sub	$64,%ebx
-	#   out += 64
-	add	$64,%edi
-	#   m += 64
-	add	$64,%esi
-	# goto bytesatleast1
-	jmp	._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644
index 03a4918f41ee..000000000000
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	# x = arg1
-	mov	%rdi,%r8
-	# m = arg2
-	mov	%rsi,%rsi
-	# out = arg3
-	mov	%rdx,%rdi
-	# bytes = arg4
-	mov	%rcx,%rdx
-	#               unsigned>? bytes - 0
-	cmp	$0,%rdx
-	# comment:fp stack unchanged by jump
-	# goto done if !unsigned>
-	jbe	._done
-	# comment:fp stack unchanged by fallthrough
-# start:
-._start:
-	# r11_stack = r11
-	movq	%r11,0(%rsp)
-	# r12_stack = r12
-	movq	%r12,8(%rsp)
-	# r13_stack = r13
-	movq	%r13,16(%rsp)
-	# r14_stack = r14
-	movq	%r14,24(%rsp)
-	# r15_stack = r15
-	movq	%r15,32(%rsp)
-	# rbx_stack = rbx
-	movq	%rbx,40(%rsp)
-	# rbp_stack = rbp
-	movq	%rbp,48(%rsp)
-	# in0 = *(uint64 *) (x + 0)
-	movq	0(%r8),%rcx
-	# in2 = *(uint64 *) (x + 8)
-	movq	8(%r8),%r9
-	# in4 = *(uint64 *) (x + 16)
-	movq	16(%r8),%rax
-	# in6 = *(uint64 *) (x + 24)
-	movq	24(%r8),%r10
-	# in8 = *(uint64 *) (x + 32)
-	movq	32(%r8),%r11
-	# in10 = *(uint64 *) (x + 40)
-	movq	40(%r8),%r12
-	# in12 = *(uint64 *) (x + 48)
-	movq	48(%r8),%r13
-	# in14 = *(uint64 *) (x + 56)
-	movq	56(%r8),%r14
-	# j0 = in0
-	movq	%rcx,56(%rsp)
-	# j2 = in2
-	movq	%r9,64(%rsp)
-	# j4 = in4
-	movq	%rax,72(%rsp)
-	# j6 = in6
-	movq	%r10,80(%rsp)
-	# j8 = in8
-	movq	%r11,88(%rsp)
-	# j10 = in10
-	movq	%r12,96(%rsp)
-	# j12 = in12
-	movq	%r13,104(%rsp)
-	# j14 = in14
-	movq	%r14,112(%rsp)
-	# x_backup = x
-	movq	%r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
-	#                   unsigned<? bytes - 64
-	cmp	$64,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto nocopy if !unsigned<
-	jae	._nocopy
-	#     ctarget = out
-	movq	%rdi,128(%rsp)
-	#     out = &tmp
-	leaq	192(%rsp),%rdi
-	#     i = bytes
-	mov	%rdx,%rcx
-	#     while (i) { *out++ = *m++; --i }
-	rep	movsb
-	#     out = &tmp
-	leaq	192(%rsp),%rdi
-	#     m = &tmp
-	leaq	192(%rsp),%rsi
-	# comment:fp stack unchanged by fallthrough
-#   nocopy:
-._nocopy:
-	#   out_backup = out
-	movq	%rdi,136(%rsp)
-	#   m_backup = m
-	movq	%rsi,144(%rsp)
-	#   bytes_backup = bytes
-	movq	%rdx,152(%rsp)
-	#   x1 = j0
-	movq	56(%rsp),%rdi
-	#   x0 = x1
-	mov	%rdi,%rdx
-	#   (uint64) x1 >>= 32
-	shr	$32,%rdi
-	#   		x3 = j2
-	movq	64(%rsp),%rsi
-	#   		x2 = x3
-	mov	%rsi,%rcx
-	#   		(uint64) x3 >>= 32
-	shr	$32,%rsi
-	#   x5 = j4
-	movq	72(%rsp),%r8
-	#   x4 = x5
-	mov	%r8,%r9
-	#   (uint64) x5 >>= 32
-	shr	$32,%r8
-	#   x5_stack = x5
-	movq	%r8,160(%rsp)
-	#   		x7 = j6
-	movq	80(%rsp),%r8
-	#   		x6 = x7
-	mov	%r8,%rax
-	#   		(uint64) x7 >>= 32
-	shr	$32,%r8
-	#   x9 = j8
-	movq	88(%rsp),%r10
-	#   x8 = x9
-	mov	%r10,%r11
-	#   (uint64) x9 >>= 32
-	shr	$32,%r10
-	#   		x11 = j10
-	movq	96(%rsp),%r12
-	#   		x10 = x11
-	mov	%r12,%r13
-	#   		x10_stack = x10
-	movq	%r13,168(%rsp)
-	#   		(uint64) x11 >>= 32
-	shr	$32,%r12
-	#   x13 = j12
-	movq	104(%rsp),%r13
-	#   x12 = x13
-	mov	%r13,%r14
-	#   (uint64) x13 >>= 32
-	shr	$32,%r13
-	#   		x15 = j14
-	movq	112(%rsp),%r15
-	#   		x14 = x15
-	mov	%r15,%rbx
-	#   		(uint64) x15 >>= 32
-	shr	$32,%r15
-	#   		x15_stack = x15
-	movq	%r15,176(%rsp)
-	#   i = 20
-	mov	$20,%r15
-#   mainloop:
-._mainloop:
-	#   i_backup = i
-	movq	%r15,184(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x12 + x0
-	lea	(%r14,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x4 ^= a
-	xor	%rbp,%r9
-	# 		b = x1 + x5
-	lea	(%rdi,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x9 ^= b
-	xor	%rbp,%r10
-	# a = x0 + x4
-	lea	(%rdx,%r9),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x8 ^= a
-	xor	%rbp,%r11
-	# 		b = x5 + x9
-	lea	(%r15,%r10),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x13 ^= b
-	xor	%rbp,%r13
-	# a = x4 + x8
-	lea	(%r9,%r11),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x12 ^= a
-	xor	%rbp,%r14
-	# 		b = x9 + x13
-	lea	(%r10,%r13),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x1 ^= b
-	xor	%rbp,%rdi
-	# a = x8 + x12
-	lea	(%r11,%r14),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x13 + x1
-	lea	(%r13,%rdi),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x6 + x10
-	lea	(%rax,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x14 ^= c
-	xor	%r15,%rbx
-	# 				c = x10 + x14
-	lea	(%rbp,%rbx),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x2 ^= c
-	xor	%r15,%rcx
-	# 				c = x14 + x2
-	lea	(%rbx,%rcx),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x6 ^= c
-	xor	%r15,%rax
-	# 				c = x2 + x6
-	lea	(%rcx,%rax),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x11 + x15
-	lea	(%r12,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x3 ^= d
-	xor	%rbp,%rsi
-	# 						d = x15 + x3
-	lea	(%r15,%rsi),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x7 ^= d
-	xor	%rbp,%r8
-	# 						d = x3 + x7
-	lea	(%rsi,%r8),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x11 ^= d
-	xor	%rbp,%r12
-	# 						d = x7 + x11
-	lea	(%r8,%r12),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x3 + x0
-	lea	(%rsi,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x1 ^= a
-	xor	%rbp,%rdi
-	# 		b = x4 + x5
-	lea	(%r9,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x6 ^= b
-	xor	%rbp,%rax
-	# a = x0 + x1
-	lea	(%rdx,%rdi),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x2 ^= a
-	xor	%rbp,%rcx
-	# 		b = x5 + x6
-	lea	(%r15,%rax),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x7 ^= b
-	xor	%rbp,%r8
-	# a = x1 + x2
-	lea	(%rdi,%rcx),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x3 ^= a
-	xor	%rbp,%rsi
-	# 		b = x6 + x7
-	lea	(%rax,%r8),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x4 ^= b
-	xor	%rbp,%r9
-	# a = x2 + x3
-	lea	(%rcx,%rsi),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x7 + x4
-	lea	(%r8,%r9),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x9 + x10
-	lea	(%r10,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x11 ^= c
-	xor	%r15,%r12
-	# 				c = x10 + x11
-	lea	(%rbp,%r12),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x8 ^= c
-	xor	%r15,%r11
-	# 				c = x11 + x8
-	lea	(%r12,%r11),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x9 ^= c
-	xor	%r15,%r10
-	# 				c = x8 + x9
-	lea	(%r11,%r10),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x14 + x15
-	lea	(%rbx,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x12 ^= d
-	xor	%rbp,%r14
-	# 						d = x15 + x12
-	lea	(%r15,%r14),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x13 ^= d
-	xor	%rbp,%r13
-	# 						d = x12 + x13
-	lea	(%r14,%r13),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x14 ^= d
-	xor	%rbp,%rbx
-	# 						d = x13 + x14
-	lea	(%r13,%rbx),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x12 + x0
-	lea	(%r14,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x4 ^= a
-	xor	%rbp,%r9
-	# 		b = x1 + x5
-	lea	(%rdi,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x9 ^= b
-	xor	%rbp,%r10
-	# a = x0 + x4
-	lea	(%rdx,%r9),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x8 ^= a
-	xor	%rbp,%r11
-	# 		b = x5 + x9
-	lea	(%r15,%r10),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x13 ^= b
-	xor	%rbp,%r13
-	# a = x4 + x8
-	lea	(%r9,%r11),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x12 ^= a
-	xor	%rbp,%r14
-	# 		b = x9 + x13
-	lea	(%r10,%r13),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x1 ^= b
-	xor	%rbp,%rdi
-	# a = x8 + x12
-	lea	(%r11,%r14),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x13 + x1
-	lea	(%r13,%rdi),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x6 + x10
-	lea	(%rax,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x14 ^= c
-	xor	%r15,%rbx
-	# 				c = x10 + x14
-	lea	(%rbp,%rbx),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x2 ^= c
-	xor	%r15,%rcx
-	# 				c = x14 + x2
-	lea	(%rbx,%rcx),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x6 ^= c
-	xor	%r15,%rax
-	# 				c = x2 + x6
-	lea	(%rcx,%rax),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x11 + x15
-	lea	(%r12,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x3 ^= d
-	xor	%rbp,%rsi
-	# 						d = x15 + x3
-	lea	(%r15,%rsi),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x7 ^= d
-	xor	%rbp,%r8
-	# 						d = x3 + x7
-	lea	(%rsi,%r8),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x11 ^= d
-	xor	%rbp,%r12
-	# 						d = x7 + x11
-	lea	(%r8,%r12),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x3 + x0
-	lea	(%rsi,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x1 ^= a
-	xor	%rbp,%rdi
-	# 		b = x4 + x5
-	lea	(%r9,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x6 ^= b
-	xor	%rbp,%rax
-	# a = x0 + x1
-	lea	(%rdx,%rdi),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x2 ^= a
-	xor	%rbp,%rcx
-	# 		b = x5 + x6
-	lea	(%r15,%rax),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x7 ^= b
-	xor	%rbp,%r8
-	# a = x1 + x2
-	lea	(%rdi,%rcx),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x3 ^= a
-	xor	%rbp,%rsi
-	# 		b = x6 + x7
-	lea	(%rax,%r8),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x4 ^= b
-	xor	%rbp,%r9
-	# a = x2 + x3
-	lea	(%rcx,%rsi),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x7 + x4
-	lea	(%r8,%r9),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x9 + x10
-	lea	(%r10,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x11 ^= c
-	xor	%r15,%r12
-	# 				c = x10 + x11
-	lea	(%rbp,%r12),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x8 ^= c
-	xor	%r15,%r11
-	# 				c = x11 + x8
-	lea	(%r12,%r11),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x9 ^= c
-	xor	%r15,%r10
-	# 				c = x8 + x9
-	lea	(%r11,%r10),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x14 + x15
-	lea	(%rbx,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x12 ^= d
-	xor	%rbp,%r14
-	# 						d = x15 + x12
-	lea	(%r15,%r14),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x13 ^= d
-	xor	%rbp,%r13
-	# 						d = x12 + x13
-	lea	(%r14,%r13),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x14 ^= d
-	xor	%rbp,%rbx
-	# 						d = x13 + x14
-	lea	(%r13,%rbx),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	#   i = i_backup
-	movq	184(%rsp),%r15
-	#                  unsigned>? i -= 4
-	sub	$4,%r15
-	# comment:fp stack unchanged by jump
-	# goto mainloop if unsigned>
-	ja	._mainloop
-	#   (uint32) x2 += j2
-	addl	64(%rsp),%ecx
-	#   x3 <<= 32
-	shl	$32,%rsi
-	#   x3 += j2
-	addq	64(%rsp),%rsi
-	#   (uint64) x3 >>= 32
-	shr	$32,%rsi
-	#   x3 <<= 32
-	shl	$32,%rsi
-	#   x2 += x3
-	add	%rsi,%rcx
-	#   (uint32) x6 += j6
-	addl	80(%rsp),%eax
-	#   x7 <<= 32
-	shl	$32,%r8
-	#   x7 += j6
-	addq	80(%rsp),%r8
-	#   (uint64) x7 >>= 32
-	shr	$32,%r8
-	#   x7 <<= 32
-	shl	$32,%r8
-	#   x6 += x7
-	add	%r8,%rax
-	#   (uint32) x8 += j8
-	addl	88(%rsp),%r11d
-	#   x9 <<= 32
-	shl	$32,%r10
-	#   x9 += j8
-	addq	88(%rsp),%r10
-	#   (uint64) x9 >>= 32
-	shr	$32,%r10
-	#   x9 <<= 32
-	shl	$32,%r10
-	#   x8 += x9
-	add	%r10,%r11
-	#   (uint32) x12 += j12
-	addl	104(%rsp),%r14d
-	#   x13 <<= 32
-	shl	$32,%r13
-	#   x13 += j12
-	addq	104(%rsp),%r13
-	#   (uint64) x13 >>= 32
-	shr	$32,%r13
-	#   x13 <<= 32
-	shl	$32,%r13
-	#   x12 += x13
-	add	%r13,%r14
-	#   (uint32) x0 += j0
-	addl	56(%rsp),%edx
-	#   x1 <<= 32
-	shl	$32,%rdi
-	#   x1 += j0
-	addq	56(%rsp),%rdi
-	#   (uint64) x1 >>= 32
-	shr	$32,%rdi
-	#   x1 <<= 32
-	shl	$32,%rdi
-	#   x0 += x1
-	add	%rdi,%rdx
-	#   x5 = x5_stack
-	movq	160(%rsp),%rdi
-	#   (uint32) x4 += j4
-	addl	72(%rsp),%r9d
-	#   x5 <<= 32
-	shl	$32,%rdi
-	#   x5 += j4
-	addq	72(%rsp),%rdi
-	#   (uint64) x5 >>= 32
-	shr	$32,%rdi
-	#   x5 <<= 32
-	shl	$32,%rdi
-	#   x4 += x5
-	add	%rdi,%r9
-	#   x10 = x10_stack
-	movq	168(%rsp),%r8
-	#   (uint32) x10 += j10
-	addl	96(%rsp),%r8d
-	#   x11 <<= 32
-	shl	$32,%r12
-	#   x11 += j10
-	addq	96(%rsp),%r12
-	#   (uint64) x11 >>= 32
-	shr	$32,%r12
-	#   x11 <<= 32
-	shl	$32,%r12
-	#   x10 += x11
-	add	%r12,%r8
-	#   x15 = x15_stack
-	movq	176(%rsp),%rdi
-	#   (uint32) x14 += j14
-	addl	112(%rsp),%ebx
-	#   x15 <<= 32
-	shl	$32,%rdi
-	#   x15 += j14
-	addq	112(%rsp),%rdi
-	#   (uint64) x15 >>= 32
-	shr	$32,%rdi
-	#   x15 <<= 32
-	shl	$32,%rdi
-	#   x14 += x15
-	add	%rdi,%rbx
-	#   out = out_backup
-	movq	136(%rsp),%rdi
-	#   m = m_backup
-	movq	144(%rsp),%rsi
-	#   x0 ^= *(uint64 *) (m + 0)
-	xorq	0(%rsi),%rdx
-	#   *(uint64 *) (out + 0) = x0
-	movq	%rdx,0(%rdi)
-	#   x2 ^= *(uint64 *) (m + 8)
-	xorq	8(%rsi),%rcx
-	#   *(uint64 *) (out + 8) = x2
-	movq	%rcx,8(%rdi)
-	#   x4 ^= *(uint64 *) (m + 16)
-	xorq	16(%rsi),%r9
-	#   *(uint64 *) (out + 16) = x4
-	movq	%r9,16(%rdi)
-	#   x6 ^= *(uint64 *) (m + 24)
-	xorq	24(%rsi),%rax
-	#   *(uint64 *) (out + 24) = x6
-	movq	%rax,24(%rdi)
-	#   x8 ^= *(uint64 *) (m + 32)
-	xorq	32(%rsi),%r11
-	#   *(uint64 *) (out + 32) = x8
-	movq	%r11,32(%rdi)
-	#   x10 ^= *(uint64 *) (m + 40)
-	xorq	40(%rsi),%r8
-	#   *(uint64 *) (out + 40) = x10
-	movq	%r8,40(%rdi)
-	#   x12 ^= *(uint64 *) (m + 48)
-	xorq	48(%rsi),%r14
-	#   *(uint64 *) (out + 48) = x12
-	movq	%r14,48(%rdi)
-	#   x14 ^= *(uint64 *) (m + 56)
-	xorq	56(%rsi),%rbx
-	#   *(uint64 *) (out + 56) = x14
-	movq	%rbx,56(%rdi)
-	#   bytes = bytes_backup
-	movq	152(%rsp),%rdx
-	#   in8 = j8
-	movq	88(%rsp),%rcx
-	#   in8 += 1
-	add	$1,%rcx
-	#   j8 = in8
-	movq	%rcx,88(%rsp)
-	#                          unsigned>? unsigned<? bytes - 64
-	cmp	$64,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto bytesatleast65 if unsigned>
-	ja	._bytesatleast65
-	# comment:fp stack unchanged by jump
-	#     goto bytesatleast64 if !unsigned<
-	jae	._bytesatleast64
-	#       m = out
-	mov	%rdi,%rsi
-	#       out = ctarget
-	movq	128(%rsp),%rdi
-	#       i = bytes
-	mov	%rdx,%rcx
-	#       while (i) { *out++ = *m++; --i }
-	rep	movsb
-	# comment:fp stack unchanged by fallthrough
-#     bytesatleast64:
-._bytesatleast64:
-	#     x = x_backup
-	movq	120(%rsp),%rdi
-	#     in8 = j8
-	movq	88(%rsp),%rsi
-	#     *(uint64 *) (x + 32) = in8
-	movq	%rsi,32(%rdi)
-	#     r11 = r11_stack
-	movq	0(%rsp),%r11
-	#     r12 = r12_stack
-	movq	8(%rsp),%r12
-	#     r13 = r13_stack
-	movq	16(%rsp),%r13
-	#     r14 = r14_stack
-	movq	24(%rsp),%r14
-	#     r15 = r15_stack
-	movq	32(%rsp),%r15
-	#     rbx = rbx_stack
-	movq	40(%rsp),%rbx
-	#     rbp = rbp_stack
-	movq	48(%rsp),%rbp
-	# comment:fp stack unchanged by fallthrough
-#     done:
-._done:
-	#     leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-#   bytesatleast65:
-._bytesatleast65:
-	#   bytes -= 64
-	sub	$64,%rdx
-	#   out += 64
-	add	$64,%rdi
-	#   m += 64
-	add	$64,%rsi
-	# comment:fp stack unchanged by jump
-	# goto bytesatleast1
-	jmp	._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644
index b07d7d959806..000000000000
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Glue code for optimized assembly version of  Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * Also modified to set up the initial state using the generic C code rather
- * than in assembly.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <asm/unaligned.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/salsa20.h>
-#include <linux/module.h>
-
-asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
-				      u32 bytes);
-
-static int salsa20_asm_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_salsa20_init(state, ctx, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		salsa20_encrypt_bytes(state, walk.src.virt.addr,
-				      walk.dst.virt.addr, nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "salsa20",
-	.base.cra_driver_name	= "salsa20-asm",
-	.base.cra_priority	= 200,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct salsa20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= SALSA20_MIN_KEY_SIZE,
-	.max_keysize		= SALSA20_MAX_KEY_SIZE,
-	.ivsize			= SALSA20_IV_SIZE,
-	.chunksize		= SALSA20_BLOCK_SIZE,
-	.setkey			= crypto_salsa20_setkey,
-	.encrypt		= salsa20_asm_crypt,
-	.decrypt		= salsa20_asm_crypt,
-};
-
-static int __init init(void)
-{
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 9af927e59d49..9de7f1e1dede 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -84,13 +84,13 @@ ENTRY(entry_SYSENTER_compat)
 	pushq	%rdx			/* pt_regs->dx */
 	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
-	pushq   $0			/* pt_regs->r8  = 0 */
+	pushq   %r8			/* pt_regs->r8 */
 	xorl	%r8d, %r8d		/* nospec   r8 */
-	pushq   $0			/* pt_regs->r9  = 0 */
+	pushq   %r9			/* pt_regs->r9 */
 	xorl	%r9d, %r9d		/* nospec   r9 */
-	pushq   $0			/* pt_regs->r10 = 0 */
+	pushq   %r10			/* pt_regs->r10 */
 	xorl	%r10d, %r10d		/* nospec   r10 */
-	pushq   $0			/* pt_regs->r11 = 0 */
+	pushq   %r11			/* pt_regs->r11 */
 	xorl	%r11d, %r11d		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
 	xorl	%ebx, %ebx		/* nospec   rbx */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d6b27dab1b30..14a2f996e543 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -396,3 +396,4 @@
 382	i386	pkey_free		sys_pkey_free			__ia32_sys_pkey_free
 383	i386	statx			sys_statx			__ia32_sys_statx
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
+385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4dfe42666d0c..cd36232ab62f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -341,6 +341,7 @@
 330	common	pkey_alloc		__x64_sys_pkey_alloc
 331	common	pkey_free		__x64_sys_pkey_free
 332	common	statx			__x64_sys_statx
+333	common	io_pgetevents		__x64_sys_io_pgetevents
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index d998a487c9b1..261802b1cc50 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -44,14 +44,14 @@ obj-y += $(vdso_img_objs)
 targets += $(vdso_img_cfiles)
 targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
 
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
 			-Wl,--no-undefined \
 			-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
 			$(DISABLE_LTO)
 
-$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
 
 HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
@@ -100,11 +100,8 @@ VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
 			   -Wl,-z,max-page-size=4096 \
 			   -Wl,-z,common-page-size=4096
 
-# 64-bit objects to re-brand as x32
-vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
-
 # x32-rebranded versions
-vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
 
 # same thing, but in the output directory
 vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
@@ -122,7 +119,7 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
 $(obj)/%.so: $(obj)/%.so.dbg
 	$(call if_changed,objcopy)
 
-$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
 	$(call if_changed,vdso)
 
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
deleted file mode 100644
index 541468e25265..000000000000
--- a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../vdso-fakesections.c"
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 70b7845434cb..7782cdbcd67d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -107,7 +107,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 		thread->cr2		= ptr;
 		thread->trap_nr		= X86_TRAP_PF;
 
-		memset(&info, 0, sizeof(info));
+		clear_siginfo(&info);
 		info.si_signo		= SIGSEGV;
 		info.si_errno		= 0;
 		info.si_code		= SEGV_MAPERR;
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 786fd875de92..4b98101209a1 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -889,7 +889,7 @@ static void force_ibs_eilvt_setup(void)
 	if (!ibs_eilvt_valid())
 		goto out;
 
-	pr_info("IBS: LVT offset %d assigned\n", offset);
+	pr_info("LVT offset %d assigned\n", offset);
 
 	return;
 out:
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index f5cbbba99283..981ba5e8241b 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -19,6 +19,7 @@
 #include <asm/cpufeature.h>
 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/smp.h>
 
 #define NUM_COUNTERS_NB		4
 #define NUM_COUNTERS_L2		4
@@ -399,26 +400,8 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
 	}
 
 	if (amd_uncore_llc) {
-		unsigned int apicid = cpu_data(cpu).apicid;
-		unsigned int nshared, subleaf, prev_eax = 0;
-
 		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
-		/*
-		 * Iterate over Cache Topology Definition leaves until no
-		 * more cache descriptions are available.
-		 */
-		for (subleaf = 0; subleaf < 5; subleaf++) {
-			cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
-
-			/* EAX[0:4] gives type of cache */
-			if (!(eax & 0x1f))
-				break;
-
-			prev_eax = eax;
-		}
-		nshared = ((prev_eax >> 14) & 0xfff) + 1;
-
-		uncore->id = apicid - (apicid % nshared);
+		uncore->id = per_cpu(cpu_llc_id, cpu);
 
 		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
 		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a6006e7bb729..6e461fb1e0d4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 #include <linux/device.h>
+#include <linux/nospec.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -304,17 +305,20 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 
 	config = attr->config;
 
-	cache_type = (config >>  0) & 0xff;
+	cache_type = (config >> 0) & 0xff;
 	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
 		return -EINVAL;
+	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
 
 	cache_op = (config >>  8) & 0xff;
 	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
 		return -EINVAL;
+	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
 
 	cache_result = (config >> 16) & 0xff;
 	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
 		return -EINVAL;
+	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
 
 	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 
@@ -421,6 +425,8 @@ int x86_setup_perfctr(struct perf_event *event)
 	if (attr->config >= x86_pmu.max_events)
 		return -EINVAL;
 
+	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
+
 	/*
 	 * The generic map:
 	 */
@@ -2391,7 +2397,7 @@ static unsigned long get_segment_base(unsigned int segment)
 
 #ifdef CONFIG_IA32_EMULATION
 
-#include <asm/compat.h>
+#include <linux/compat.h>
 
 static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 607bf565a90c..707b2a96e516 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3339,7 +3339,8 @@ static void intel_pmu_cpu_starting(int cpu)
 
 	cpuc->lbr_sel = NULL;
 
-	flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+	if (x86_pmu.version > 1)
+		flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 
 	if (!cpuc->shared_regs)
 		return;
@@ -3502,6 +3503,8 @@ static __initconst const struct x86_pmu core_pmu = {
 	.cpu_dying		= intel_pmu_cpu_dying,
 };
 
+static struct attribute *intel_pmu_attrs[];
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -3533,6 +3536,8 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.format_attrs		= intel_arch3_formats_attr,
 	.events_sysfs_show	= intel_event_sysfs_show,
 
+	.attrs			= intel_pmu_attrs,
+
 	.cpu_prepare		= intel_pmu_cpu_prepare,
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
@@ -3911,8 +3916,6 @@ __init int intel_pmu_init(void)
 
 	x86_pmu.max_pebs_events		= min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
 
-
-	x86_pmu.attrs			= intel_pmu_attrs;
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
 	 * assume at least 3 events, when not running in a hypervisor:
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9aca448bb8e6..9f8084f18d58 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -92,6 +92,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
+#include <linux/nospec.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include "../perf_event.h"
@@ -302,6 +303,7 @@ static int cstate_pmu_event_init(struct perf_event *event)
 	} else if (event->pmu == &cstate_pkg_pmu) {
 		if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
 			return -EINVAL;
+		cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
 		if (!pkg_msr[cfg].attr)
 			return -EINVAL;
 		event->hw.event_base = pkg_msr[cfg].msr;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b993942a0e4..8d016ce5b80d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1194,7 +1194,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
 		    filter->action == PERF_ADDR_FILTER_ACTION_START)
 			return -EOPNOTSUPP;
 
-		if (!filter->inode) {
+		if (!filter->path.dentry) {
 			if (!valid_kernel_ip(filter->offset))
 				return -EINVAL;
 
@@ -1221,7 +1221,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
 		return;
 
 	list_for_each_entry(filter, &head->list, entry) {
-		if (filter->inode && !offs[range]) {
+		if (filter->path.dentry && !offs[range]) {
 			msr_a = msr_b = 0;
 		} else {
 			/* apply the offset */
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index a7956fc7ca1d..15b07379e72d 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -203,7 +203,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box,
 	hwc->idx = idx;
 	hwc->last_tag = ++box->tags[idx];
 
-	if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+	if (uncore_pmc_fixed(hwc->idx)) {
 		hwc->event_base = uncore_fixed_ctr(box);
 		hwc->config_base = uncore_fixed_ctl(box);
 		return;
@@ -218,7 +218,9 @@ void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *e
 	u64 prev_count, new_count, delta;
 	int shift;
 
-	if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+	if (uncore_pmc_freerunning(event->hw.idx))
+		shift = 64 - uncore_freerunning_bits(box, event);
+	else if (uncore_pmc_fixed(event->hw.idx))
 		shift = 64 - uncore_fixed_ctr_bits(box);
 	else
 		shift = 64 - uncore_perf_ctr_bits(box);
@@ -449,15 +451,30 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	return ret ? -EINVAL : 0;
 }
 
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
+void uncore_pmu_event_start(struct perf_event *event, int flags)
 {
 	struct intel_uncore_box *box = uncore_event_to_box(event);
 	int idx = event->hw.idx;
 
-	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
 		return;
 
-	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+	/*
+	 * Free running counter is read-only and always active.
+	 * Use the current counter value as start point.
+	 * There is no overflow interrupt for free running counter.
+	 * Use hrtimer to periodically poll the counter to avoid overflow.
+	 */
+	if (uncore_pmc_freerunning(event->hw.idx)) {
+		list_add_tail(&event->active_entry, &box->active_list);
+		local64_set(&event->hw.prev_count,
+			    uncore_read_counter(box, event));
+		if (box->n_active++ == 0)
+			uncore_pmu_start_hrtimer(box);
+		return;
+	}
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 		return;
 
 	event->hw.state = 0;
@@ -474,11 +491,20 @@ static void uncore_pmu_event_start(struct perf_event *event, int flags)
 	}
 }
 
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+void uncore_pmu_event_stop(struct perf_event *event, int flags)
 {
 	struct intel_uncore_box *box = uncore_event_to_box(event);
 	struct hw_perf_event *hwc = &event->hw;
 
+	/* Cannot disable free running counter which is read-only */
+	if (uncore_pmc_freerunning(hwc->idx)) {
+		list_del(&event->active_entry);
+		if (--box->n_active == 0)
+			uncore_pmu_cancel_hrtimer(box);
+		uncore_perf_event_update(box, event);
+		return;
+	}
+
 	if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
 		uncore_disable_event(box, event);
 		box->n_active--;
@@ -502,7 +528,7 @@ static void uncore_pmu_event_stop(struct perf_event *event, int flags)
 	}
 }
 
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
+int uncore_pmu_event_add(struct perf_event *event, int flags)
 {
 	struct intel_uncore_box *box = uncore_event_to_box(event);
 	struct hw_perf_event *hwc = &event->hw;
@@ -512,6 +538,17 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags)
 	if (!box)
 		return -ENODEV;
 
+	/*
+	 * The free funning counter is assigned in event_init().
+	 * The free running counter event and free running counter
+	 * are 1:1 mapped. It doesn't need to be tracked in event_list.
+	 */
+	if (uncore_pmc_freerunning(hwc->idx)) {
+		if (flags & PERF_EF_START)
+			uncore_pmu_event_start(event, 0);
+		return 0;
+	}
+
 	ret = n = uncore_collect_events(box, event, false);
 	if (ret < 0)
 		return ret;
@@ -563,13 +600,21 @@ static int uncore_pmu_event_add(struct perf_event *event, int flags)
 	return 0;
 }
 
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
+void uncore_pmu_event_del(struct perf_event *event, int flags)
 {
 	struct intel_uncore_box *box = uncore_event_to_box(event);
 	int i;
 
 	uncore_pmu_event_stop(event, PERF_EF_UPDATE);
 
+	/*
+	 * The event for free running counter is not tracked by event_list.
+	 * It doesn't need to force event->hw.idx = -1 to reassign the counter.
+	 * Because the event and the free running counter are 1:1 mapped.
+	 */
+	if (uncore_pmc_freerunning(event->hw.idx))
+		return;
+
 	for (i = 0; i < box->n_events; i++) {
 		if (event == box->event_list[i]) {
 			uncore_put_event_constraint(box, event);
@@ -603,6 +648,10 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu,
 	struct intel_uncore_box *fake_box;
 	int ret = -EINVAL, n;
 
+	/* The free running counter is always active. */
+	if (uncore_pmc_freerunning(event->hw.idx))
+		return 0;
+
 	fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
 	if (!fake_box)
 		return -ENOMEM;
@@ -690,6 +739,17 @@ static int uncore_pmu_event_init(struct perf_event *event)
 
 		/* fixed counters have event field hardcoded to zero */
 		hwc->config = 0ULL;
+	} else if (is_freerunning_event(event)) {
+		if (!check_valid_freerunning_event(box, event))
+			return -EINVAL;
+		event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
+		/*
+		 * The free running counter event and free running counter
+		 * are always 1:1 mapped.
+		 * The free running counter is always active.
+		 * Assign the free running counter here.
+		 */
+		event->hw.event_base = uncore_freerunning_counter(box, event);
 	} else {
 		hwc->config = event->attr.config &
 			      (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 414dc7e7c950..c9e1e0bef3c3 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -12,8 +12,13 @@
 
 #define UNCORE_FIXED_EVENT		0xff
 #define UNCORE_PMC_IDX_MAX_GENERIC	8
+#define UNCORE_PMC_IDX_MAX_FIXED	1
+#define UNCORE_PMC_IDX_MAX_FREERUNNING	1
 #define UNCORE_PMC_IDX_FIXED		UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX		(UNCORE_PMC_IDX_FIXED + 1)
+#define UNCORE_PMC_IDX_FREERUNNING	(UNCORE_PMC_IDX_FIXED + \
+					UNCORE_PMC_IDX_MAX_FIXED)
+#define UNCORE_PMC_IDX_MAX		(UNCORE_PMC_IDX_FREERUNNING + \
+					UNCORE_PMC_IDX_MAX_FREERUNNING)
 
 #define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx)	\
 		((dev << 24) | (func << 16) | (type << 8) | idx)
@@ -35,6 +40,7 @@ struct intel_uncore_ops;
 struct intel_uncore_pmu;
 struct intel_uncore_box;
 struct uncore_event_desc;
+struct freerunning_counters;
 
 struct intel_uncore_type {
 	const char *name;
@@ -42,6 +48,7 @@ struct intel_uncore_type {
 	int num_boxes;
 	int perf_ctr_bits;
 	int fixed_ctr_bits;
+	int num_freerunning_types;
 	unsigned perf_ctr;
 	unsigned event_ctl;
 	unsigned event_mask;
@@ -59,6 +66,7 @@ struct intel_uncore_type {
 	struct intel_uncore_pmu *pmus;
 	struct intel_uncore_ops *ops;
 	struct uncore_event_desc *event_descs;
+	struct freerunning_counters *freerunning;
 	const struct attribute_group *attr_groups[4];
 	struct pmu *pmu; /* for custom pmu ops */
 };
@@ -129,6 +137,14 @@ struct uncore_event_desc {
 	const char *config;
 };
 
+struct freerunning_counters {
+	unsigned int counter_base;
+	unsigned int counter_offset;
+	unsigned int box_offset;
+	unsigned int num_counters;
+	unsigned int bits;
+};
+
 struct pci2phy_map {
 	struct list_head list;
 	int segment;
@@ -157,6 +173,16 @@ static ssize_t __uncore_##_var##_show(struct kobject *kobj,		\
 static struct kobj_attribute format_attr_##_var =			\
 	__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
 
+static inline bool uncore_pmc_fixed(int idx)
+{
+	return idx == UNCORE_PMC_IDX_FIXED;
+}
+
+static inline bool uncore_pmc_freerunning(int idx)
+{
+	return idx == UNCORE_PMC_IDX_FREERUNNING;
+}
+
 static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
 {
 	return box->pmu->type->box_ctl;
@@ -214,6 +240,60 @@ static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
 	return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
 }
 
+
+/*
+ * In the uncore document, there is no event-code assigned to free running
+ * counters. Some events need to be defined to indicate the free running
+ * counters. The events are encoded as event-code + umask-code.
+ *
+ * The event-code for all free running counters is 0xff, which is the same as
+ * the fixed counters.
+ *
+ * The umask-code is used to distinguish a fixed counter and a free running
+ * counter, and different types of free running counters.
+ * - For fixed counters, the umask-code is 0x0X.
+ *   X indicates the index of the fixed counter, which starts from 0.
+ * - For free running counters, the umask-code uses the rest of the space.
+ *   It would bare the format of 0xXY.
+ *   X stands for the type of free running counters, which starts from 1.
+ *   Y stands for the index of free running counters of same type, which
+ *   starts from 0.
+ *
+ * For example, there are three types of IIO free running counters on Skylake
+ * server, IO CLOCKS counters, BANDWIDTH counters and UTILIZATION counters.
+ * The event-code for all the free running counters is 0xff.
+ * 'ioclk' is the first counter of IO CLOCKS. IO CLOCKS is the first type,
+ * which umask-code starts from 0x10.
+ * So 'ioclk' is encoded as event=0xff,umask=0x10
+ * 'bw_in_port2' is the third counter of BANDWIDTH counters. BANDWIDTH is
+ * the second type, which umask-code starts from 0x20.
+ * So 'bw_in_port2' is encoded as event=0xff,umask=0x22
+ */
+static inline unsigned int uncore_freerunning_idx(u64 config)
+{
+	return ((config >> 8) & 0xf);
+}
+
+#define UNCORE_FREERUNNING_UMASK_START		0x10
+
+static inline unsigned int uncore_freerunning_type(u64 config)
+{
+	return ((((config >> 8) - UNCORE_FREERUNNING_UMASK_START) >> 4) & 0xf);
+}
+
+static inline
+unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
+					struct perf_event *event)
+{
+	unsigned int type = uncore_freerunning_type(event->attr.config);
+	unsigned int idx = uncore_freerunning_idx(event->attr.config);
+	struct intel_uncore_pmu *pmu = box->pmu;
+
+	return pmu->type->freerunning[type].counter_base +
+	       pmu->type->freerunning[type].counter_offset * idx +
+	       pmu->type->freerunning[type].box_offset * pmu->pmu_idx;
+}
+
 static inline
 unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
 {
@@ -276,11 +356,52 @@ static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
 	return box->pmu->type->fixed_ctr_bits;
 }
 
+static inline
+unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
+				     struct perf_event *event)
+{
+	unsigned int type = uncore_freerunning_type(event->attr.config);
+
+	return box->pmu->type->freerunning[type].bits;
+}
+
+static inline int uncore_num_freerunning(struct intel_uncore_box *box,
+					 struct perf_event *event)
+{
+	unsigned int type = uncore_freerunning_type(event->attr.config);
+
+	return box->pmu->type->freerunning[type].num_counters;
+}
+
+static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
+					       struct perf_event *event)
+{
+	return box->pmu->type->num_freerunning_types;
+}
+
+static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
+						 struct perf_event *event)
+{
+	unsigned int type = uncore_freerunning_type(event->attr.config);
+	unsigned int idx = uncore_freerunning_idx(event->attr.config);
+
+	return (type < uncore_num_freerunning_types(box, event)) &&
+	       (idx < uncore_num_freerunning(box, event));
+}
+
 static inline int uncore_num_counters(struct intel_uncore_box *box)
 {
 	return box->pmu->type->num_counters;
 }
 
+static inline bool is_freerunning_event(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	return ((cfg & UNCORE_FIXED_EVENT) == UNCORE_FIXED_EVENT) &&
+	       (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START);
+}
+
 static inline void uncore_disable_box(struct intel_uncore_box *box)
 {
 	if (box->pmu->type->ops->disable_box)
@@ -346,6 +467,10 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
 u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
 void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
 void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_start(struct perf_event *event, int flags);
+void uncore_pmu_event_stop(struct perf_event *event, int flags);
+int uncore_pmu_event_add(struct perf_event *event, int flags);
+void uncore_pmu_event_del(struct perf_event *event, int flags);
 void uncore_pmu_event_read(struct perf_event *event);
 void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
 struct event_constraint *
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 93e7a8397cde..173e2674be6e 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -246,7 +246,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+	if (hwc->idx == UNCORE_PMC_IDX_FIXED)
 		wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
 	else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
 		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index aee5e8496be4..8527c3e1038b 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -285,6 +285,15 @@ static struct uncore_event_desc snb_uncore_imc_events[] = {
 #define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054
 #define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE
 
+enum perf_snb_uncore_imc_freerunning_types {
+	SNB_PCI_UNCORE_IMC_DATA		= 0,
+	SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snb_uncore_imc_freerunning[] = {
+	[SNB_PCI_UNCORE_IMC_DATA]     = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 },
+};
+
 static struct attribute *snb_uncore_imc_formats_attr[] = {
 	&format_attr_event.attr,
 	NULL,
@@ -341,9 +350,8 @@ static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf
 }
 
 /*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
+ * Keep the custom event_init() function compatible with old event
+ * encoding for free running counters.
  */
 static int snb_uncore_imc_event_init(struct perf_event *event)
 {
@@ -405,11 +413,11 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
 	switch (cfg) {
 	case SNB_UNCORE_PCI_IMC_DATA_READS:
 		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
-		idx = UNCORE_PMC_IDX_FIXED;
+		idx = UNCORE_PMC_IDX_FREERUNNING;
 		break;
 	case SNB_UNCORE_PCI_IMC_DATA_WRITES:
 		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
-		idx = UNCORE_PMC_IDX_FIXED + 1;
+		idx = UNCORE_PMC_IDX_FREERUNNING;
 		break;
 	default:
 		return -EINVAL;
@@ -430,75 +438,6 @@ static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_ev
 	return 0;
 }
 
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	u64 count;
-
-	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-		return;
-
-	event->hw.state = 0;
-	box->n_active++;
-
-	list_add_tail(&event->active_entry, &box->active_list);
-
-	count = snb_uncore_imc_read_counter(box, event);
-	local64_set(&event->hw.prev_count, count);
-
-	if (box->n_active == 1)
-		uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (!(hwc->state & PERF_HES_STOPPED)) {
-		box->n_active--;
-
-		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-		hwc->state |= PERF_HES_STOPPED;
-
-		list_del(&event->active_entry);
-
-		if (box->n_active == 0)
-			uncore_pmu_cancel_hrtimer(box);
-	}
-
-	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		/*
-		 * Drain the remaining delta count out of a event
-		 * that we are disabling:
-		 */
-		uncore_perf_event_update(box, event);
-		hwc->state |= PERF_HES_UPTODATE;
-	}
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (!box)
-		return -ENODEV;
-
-	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-	if (!(flags & PERF_EF_START))
-		hwc->state |= PERF_HES_ARCH;
-
-	snb_uncore_imc_event_start(event, 0);
-
-	return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
-	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-}
-
 int snb_pci2phy_map_init(int devid)
 {
 	struct pci_dev *dev = NULL;
@@ -530,10 +469,10 @@ int snb_pci2phy_map_init(int devid)
 static struct pmu snb_uncore_imc_pmu = {
 	.task_ctx_nr	= perf_invalid_context,
 	.event_init	= snb_uncore_imc_event_init,
-	.add		= snb_uncore_imc_event_add,
-	.del		= snb_uncore_imc_event_del,
-	.start		= snb_uncore_imc_event_start,
-	.stop		= snb_uncore_imc_event_stop,
+	.add		= uncore_pmu_event_add,
+	.del		= uncore_pmu_event_del,
+	.start		= uncore_pmu_event_start,
+	.stop		= uncore_pmu_event_stop,
 	.read		= uncore_pmu_event_read,
 };
 
@@ -552,12 +491,10 @@ static struct intel_uncore_type snb_uncore_imc = {
 	.name		= "imc",
 	.num_counters   = 2,
 	.num_boxes	= 1,
-	.fixed_ctr_bits	= 32,
-	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE,
+	.num_freerunning_types	= SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+	.freerunning	= snb_uncore_imc_freerunning,
 	.event_descs	= snb_uncore_imc_events,
 	.format_group	= &snb_uncore_imc_format_group,
-	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
-	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK,
 	.ops		= &snb_uncore_imc_ops,
 	.pmu		= &snb_uncore_imc_pmu,
 };
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c98b943e58b4..87dc0263a2e1 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3028,10 +3028,27 @@ static struct intel_uncore_type bdx_uncore_cbox = {
 	.format_group		= &hswep_uncore_cbox_format_group,
 };
 
+static struct intel_uncore_type bdx_uncore_sbox = {
+	.name			= "sbox",
+	.num_counters		= 4,
+	.num_boxes		= 4,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_S0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_S0_MSR_PMON_CTR0,
+	.event_mask		= HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_S0_MSR_PMON_BOX_CTL,
+	.msr_offset		= HSWEP_SBOX_MSR_OFFSET,
+	.ops			= &hswep_uncore_sbox_msr_ops,
+	.format_group		= &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX	3
+
 static struct intel_uncore_type *bdx_msr_uncores[] = {
 	&bdx_uncore_ubox,
 	&bdx_uncore_cbox,
 	&hswep_uncore_pcu,
+	&bdx_uncore_sbox,
 	NULL,
 };
 
@@ -3043,10 +3060,25 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
 
 void bdx_uncore_cpu_init(void)
 {
+	int pkg = topology_phys_to_logical_pkg(0);
+
 	if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
 		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
 	uncore_msr_uncores = bdx_msr_uncores;
 
+	/* BDX-DE doesn't have SBOX */
+	if (boot_cpu_data.x86_model == 86) {
+		uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+	/* Detect systems with no SBOXes */
+	} else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+		struct pci_dev *pdev;
+		u32 capid4;
+
+		pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
+		pci_read_config_dword(pdev, 0x94, &capid4);
+		if (((capid4 >> 6) & 0x3) == 0)
+			bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+	}
 	hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
 }
 
@@ -3264,6 +3296,11 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
 		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
 	},
+	{ /* PCU.3 (for Capability registers) */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   HSWEP_PCI_PCU_3),
+	},
 	{ /* end: all zeroes */ }
 };
 
@@ -3485,6 +3522,87 @@ static struct intel_uncore_type skx_uncore_iio = {
 	.format_group		= &skx_uncore_iio_format_group,
 };
 
+enum perf_uncore_iio_freerunning_type_id {
+	SKX_IIO_MSR_IOCLK			= 0,
+	SKX_IIO_MSR_BW				= 1,
+	SKX_IIO_MSR_UTIL			= 2,
+
+	SKX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+
+static struct freerunning_counters skx_iio_freerunning[] = {
+	[SKX_IIO_MSR_IOCLK]	= { 0xa45, 0x1, 0x20, 1, 36 },
+	[SKX_IIO_MSR_BW]	= { 0xb00, 0x1, 0x10, 8, 36 },
+	[SKX_IIO_MSR_UTIL]	= { 0xb08, 0x1, 0x10, 8, 36 },
+};
+
+static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = {
+	/* Free-Running IO CLOCKS Counter */
+	INTEL_UNCORE_EVENT_DESC(ioclk,			"event=0xff,umask=0x10"),
+	/* Free-Running IIO BANDWIDTH Counters */
+	INTEL_UNCORE_EVENT_DESC(bw_in_port0,		"event=0xff,umask=0x20"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port1,		"event=0xff,umask=0x21"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port2,		"event=0xff,umask=0x22"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port3,		"event=0xff,umask=0x23"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port0,		"event=0xff,umask=0x24"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port1,		"event=0xff,umask=0x25"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port2,		"event=0xff,umask=0x26"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit,	"MiB"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port3,		"event=0xff,umask=0x27"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale,	"3.814697266e-6"),
+	INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit,	"MiB"),
+	/* Free-running IIO UTILIZATION Counters */
+	INTEL_UNCORE_EVENT_DESC(util_in_port0,		"event=0xff,umask=0x30"),
+	INTEL_UNCORE_EVENT_DESC(util_out_port0,		"event=0xff,umask=0x31"),
+	INTEL_UNCORE_EVENT_DESC(util_in_port1,		"event=0xff,umask=0x32"),
+	INTEL_UNCORE_EVENT_DESC(util_out_port1,		"event=0xff,umask=0x33"),
+	INTEL_UNCORE_EVENT_DESC(util_in_port2,		"event=0xff,umask=0x34"),
+	INTEL_UNCORE_EVENT_DESC(util_out_port2,		"event=0xff,umask=0x35"),
+	INTEL_UNCORE_EVENT_DESC(util_in_port3,		"event=0xff,umask=0x36"),
+	INTEL_UNCORE_EVENT_DESC(util_out_port3,		"event=0xff,umask=0x37"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = {
+	.read_counter		= uncore_msr_read_counter,
+};
+
+static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_freerunning_format_group = {
+	.name = "format",
+	.attrs = skx_uncore_iio_freerunning_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_iio_free_running = {
+	.name			= "iio_free_running",
+	.num_counters		= 17,
+	.num_boxes		= 6,
+	.num_freerunning_types	= SKX_IIO_FREERUNNING_TYPE_MAX,
+	.freerunning		= skx_iio_freerunning,
+	.ops			= &skx_uncore_iio_freerunning_ops,
+	.event_descs		= skx_uncore_iio_freerunning_events,
+	.format_group		= &skx_uncore_iio_freerunning_format_group,
+};
+
 static struct attribute *skx_uncore_formats_attr[] = {
 	&format_attr_event.attr,
 	&format_attr_umask.attr,
@@ -3558,6 +3676,7 @@ static struct intel_uncore_type *skx_msr_uncores[] = {
 	&skx_uncore_ubox,
 	&skx_uncore_chabox,
 	&skx_uncore_iio,
+	&skx_uncore_iio_free_running,
 	&skx_uncore_irp,
 	&skx_uncore_pcu,
 	NULL,
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index e7edf19e64c2..b4771a6ddbc1 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/perf_event.h>
+#include <linux/nospec.h>
 #include <asm/intel-family.h>
 
 enum perf_msr_id {
@@ -158,9 +159,6 @@ static int msr_event_init(struct perf_event *event)
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
-	if (cfg >= PERF_MSR_EVENT_MAX)
-		return -EINVAL;
-
 	/* unsupported modes and filters */
 	if (event->attr.exclude_user   ||
 	    event->attr.exclude_kernel ||
@@ -171,6 +169,11 @@ static int msr_event_init(struct perf_event *event)
 	    event->attr.sample_period) /* no sampling */
 		return -EINVAL;
 
+	if (cfg >= PERF_MSR_EVENT_MAX)
+		return -EINVAL;
+
+	cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX);
+
 	if (!msr[cfg].attr)
 		return -EINVAL;
 
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 367a8203cfcf..b173d404e3df 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1 +1,2 @@
-obj-y		:= hv_init.o mmu.o
+obj-y			:= hv_init.o mmu.o
+obj-$(CONFIG_X86_64)	+= hv_apic.o
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
new file mode 100644
index 000000000000..f68855499391
--- /dev/null
+++ b/arch/x86/hyperv/hv_apic.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific APIC code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+static struct apic orig_apic;
+
+static u64 hv_apic_icr_read(void)
+{
+	u64 reg_val;
+
+	rdmsrl(HV_X64_MSR_ICR, reg_val);
+	return reg_val;
+}
+
+static void hv_apic_icr_write(u32 low, u32 id)
+{
+	u64 reg_val;
+
+	reg_val = SET_APIC_DEST_FIELD(id);
+	reg_val = reg_val << 32;
+	reg_val |= low;
+
+	wrmsrl(HV_X64_MSR_ICR, reg_val);
+}
+
+static u32 hv_apic_read(u32 reg)
+{
+	u32 reg_val, hi;
+
+	switch (reg) {
+	case APIC_EOI:
+		rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+		return reg_val;
+	case APIC_TASKPRI:
+		rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+		return reg_val;
+
+	default:
+		return native_apic_mem_read(reg);
+	}
+}
+
+static void hv_apic_write(u32 reg, u32 val)
+{
+	switch (reg) {
+	case APIC_EOI:
+		wrmsr(HV_X64_MSR_EOI, val, 0);
+		break;
+	case APIC_TASKPRI:
+		wrmsr(HV_X64_MSR_TPR, val, 0);
+		break;
+	default:
+		native_apic_mem_write(reg, val);
+	}
+}
+
+static void hv_apic_eoi_write(u32 reg, u32 val)
+{
+	wrmsr(HV_X64_MSR_EOI, val, 0);
+}
+
+/*
+ * IPI implementation on Hyper-V.
+ */
+static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
+{
+	struct ipi_arg_ex **arg;
+	struct ipi_arg_ex *ipi_arg;
+	unsigned long flags;
+	int nr_bank = 0;
+	int ret = 1;
+
+	local_irq_save(flags);
+	arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	ipi_arg = *arg;
+	if (unlikely(!ipi_arg))
+		goto ipi_mask_ex_done;
+
+	ipi_arg->vector = vector;
+	ipi_arg->reserved = 0;
+	ipi_arg->vp_set.valid_bank_mask = 0;
+
+	if (!cpumask_equal(mask, cpu_present_mask)) {
+		ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+		nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+	}
+	if (!nr_bank)
+		ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+
+	ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+			      ipi_arg, NULL);
+
+ipi_mask_ex_done:
+	local_irq_restore(flags);
+	return ((ret == 0) ? true : false);
+}
+
+static bool __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	int cur_cpu, vcpu;
+	struct ipi_arg_non_ex **arg;
+	struct ipi_arg_non_ex *ipi_arg;
+	int ret = 1;
+	unsigned long flags;
+
+	if (cpumask_empty(mask))
+		return true;
+
+	if (!hv_hypercall_pg)
+		return false;
+
+	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+		return false;
+
+	if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+		return __send_ipi_mask_ex(mask, vector);
+
+	local_irq_save(flags);
+	arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	ipi_arg = *arg;
+	if (unlikely(!ipi_arg))
+		goto ipi_mask_done;
+
+	ipi_arg->vector = vector;
+	ipi_arg->reserved = 0;
+	ipi_arg->cpu_mask = 0;
+
+	for_each_cpu(cur_cpu, mask) {
+		vcpu = hv_cpu_number_to_vp_number(cur_cpu);
+		/*
+		 * This particular version of the IPI hypercall can
+		 * only target upto 64 CPUs.
+		 */
+		if (vcpu >= 64)
+			goto ipi_mask_done;
+
+		__set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask);
+	}
+
+	ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL);
+
+ipi_mask_done:
+	local_irq_restore(flags);
+	return ((ret == 0) ? true : false);
+}
+
+static bool __send_ipi_one(int cpu, int vector)
+{
+	struct cpumask mask = CPU_MASK_NONE;
+
+	cpumask_set_cpu(cpu, &mask);
+	return __send_ipi_mask(&mask, vector);
+}
+
+static void hv_send_ipi(int cpu, int vector)
+{
+	if (!__send_ipi_one(cpu, vector))
+		orig_apic.send_IPI(cpu, vector);
+}
+
+static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	if (!__send_ipi_mask(mask, vector))
+		orig_apic.send_IPI_mask(mask, vector);
+}
+
+static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct cpumask new_mask;
+	const struct cpumask *local_mask;
+
+	cpumask_copy(&new_mask, mask);
+	cpumask_clear_cpu(this_cpu, &new_mask);
+	local_mask = &new_mask;
+	if (!__send_ipi_mask(local_mask, vector))
+		orig_apic.send_IPI_mask_allbutself(mask, vector);
+}
+
+static void hv_send_ipi_allbutself(int vector)
+{
+	hv_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void hv_send_ipi_all(int vector)
+{
+	if (!__send_ipi_mask(cpu_online_mask, vector))
+		orig_apic.send_IPI_all(vector);
+}
+
+static void hv_send_ipi_self(int vector)
+{
+	if (!__send_ipi_one(smp_processor_id(), vector))
+		orig_apic.send_IPI_self(vector);
+}
+
+void __init hv_apic_init(void)
+{
+	if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
+		if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+			pr_info("Hyper-V: Using ext hypercalls for IPI\n");
+		else
+			pr_info("Hyper-V: Using IPI hypercalls\n");
+		/*
+		 * Set the IPI entry points.
+		 */
+		orig_apic = *apic;
+
+		apic->send_IPI = hv_send_ipi;
+		apic->send_IPI_mask = hv_send_ipi_mask;
+		apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself;
+		apic->send_IPI_allbutself = hv_send_ipi_allbutself;
+		apic->send_IPI_all = hv_send_ipi_all;
+		apic->send_IPI_self = hv_send_ipi_self;
+	}
+
+	if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
+		pr_info("Hyper-V: Using MSR based APIC access\n");
+		apic_set_eoi_write(hv_apic_eoi_write);
+		apic->read      = hv_apic_read;
+		apic->write     = hv_apic_write;
+		apic->icr_write = hv_apic_icr_write;
+		apic->icr_read  = hv_apic_icr_read;
+	}
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index cfecc2272f2d..4c431e1c1eff 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -91,12 +91,19 @@ EXPORT_SYMBOL_GPL(hv_vp_index);
 struct hv_vp_assist_page **hv_vp_assist_page;
 EXPORT_SYMBOL_GPL(hv_vp_assist_page);
 
+void  __percpu **hyperv_pcpu_input_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+
 u32 hv_max_vp_index;
 
 static int hv_cpu_init(unsigned int cpu)
 {
 	u64 msr_vp_index;
 	struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+	void **input_arg;
+
+	input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+	*input_arg = page_address(alloc_page(GFP_KERNEL));
 
 	hv_get_vp_index(msr_vp_index);
 
@@ -217,6 +224,16 @@ static int hv_cpu_die(unsigned int cpu)
 {
 	struct hv_reenlightenment_control re_ctrl;
 	unsigned int new_cpu;
+	unsigned long flags;
+	void **input_arg;
+	void *input_pg = NULL;
+
+	local_irq_save(flags);
+	input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+	input_pg = *input_arg;
+	*input_arg = NULL;
+	local_irq_restore(flags);
+	free_page((unsigned long)input_pg);
 
 	if (hv_vp_assist_page && hv_vp_assist_page[cpu])
 		wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -242,8 +259,9 @@ static int hv_cpu_die(unsigned int cpu)
  *
  * 1. Setup the hypercall page.
  * 2. Register Hyper-V specific clocksource.
+ * 3. Setup Hyper-V specific APIC entry points.
  */
-void hyperv_init(void)
+void __init hyperv_init(void)
 {
 	u64 guest_id, required_msrs;
 	union hv_x64_msr_hypercall_contents hypercall_msr;
@@ -259,6 +277,16 @@ void hyperv_init(void)
 	if ((ms_hyperv.features & required_msrs) != required_msrs)
 		return;
 
+	/*
+	 * Allocate the per-CPU state for the hypercall input arg.
+	 * If this allocation fails, we will not be able to setup
+	 * (per-CPU) hypercall input page and thus this failure is
+	 * fatal on Hyper-V.
+	 */
+	hyperv_pcpu_input_arg = alloc_percpu(void  *);
+
+	BUG_ON(hyperv_pcpu_input_arg == NULL);
+
 	/* Allocate percpu VP index */
 	hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
 				    GFP_KERNEL);
@@ -296,7 +324,7 @@ void hyperv_init(void)
 	hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
 	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 
-	hyper_alloc_mmu();
+	hv_apic_init();
 
 	/*
 	 * Register Hyper-V specific clocksource.
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 56c9ebac946f..5f053d7d1bd9 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -25,20 +25,13 @@ struct hv_flush_pcpu {
 struct hv_flush_pcpu_ex {
 	u64 address_space;
 	u64 flags;
-	struct {
-		u64 format;
-		u64 valid_bank_mask;
-		u64 bank_contents[];
-	} hv_vp_set;
+	struct hv_vpset hv_vp_set;
 	u64 gva_list[];
 };
 
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu **pcpu_flush;
-
-static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
@@ -70,41 +63,6 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
 	return gva_n - offset;
 }
 
-/* Return the number of banks in the resulting vp_set */
-static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
-				    const struct cpumask *cpus)
-{
-	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
-
-	/* valid_bank_mask can represent up to 64 banks */
-	if (hv_max_vp_index / 64 >= 64)
-		return 0;
-
-	/*
-	 * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
-	 * structs are not cleared between calls, we risk flushing unneeded
-	 * vCPUs otherwise.
-	 */
-	for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
-		flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
-
-	/*
-	 * Some banks may end up being empty but this is acceptable.
-	 */
-	for_each_cpu(cpu, cpus) {
-		vcpu = hv_cpu_number_to_vp_number(cpu);
-		vcpu_bank = vcpu / 64;
-		vcpu_offset = vcpu % 64;
-		__set_bit(vcpu_offset, (unsigned long *)
-			  &flush->hv_vp_set.bank_contents[vcpu_bank]);
-		if (vcpu_bank >= nr_bank)
-			nr_bank = vcpu_bank + 1;
-	}
-	flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0);
-
-	return nr_bank;
-}
-
 static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 				    const struct flush_tlb_info *info)
 {
@@ -116,7 +74,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
 	trace_hyperv_mmu_flush_tlb_others(cpus, info);
 
-	if (!pcpu_flush || !hv_hypercall_pg)
+	if (!hv_hypercall_pg)
 		goto do_native;
 
 	if (cpumask_empty(cpus))
@@ -124,10 +82,8 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush_pcpu = this_cpu_ptr(pcpu_flush);
-
-	if (unlikely(!*flush_pcpu))
-		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+	flush_pcpu = (struct hv_flush_pcpu **)
+		     this_cpu_ptr(hyperv_pcpu_input_arg);
 
 	flush = *flush_pcpu;
 
@@ -203,7 +159,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
 	trace_hyperv_mmu_flush_tlb_others(cpus, info);
 
-	if (!pcpu_flush_ex || !hv_hypercall_pg)
+	if (!hv_hypercall_pg)
 		goto do_native;
 
 	if (cpumask_empty(cpus))
@@ -211,10 +167,8 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
-
-	if (unlikely(!*flush_pcpu))
-		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+	flush_pcpu = (struct hv_flush_pcpu_ex **)
+		     this_cpu_ptr(hyperv_pcpu_input_arg);
 
 	flush = *flush_pcpu;
 
@@ -239,8 +193,8 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 	flush->hv_vp_set.valid_bank_mask = 0;
 
 	if (!cpumask_equal(cpus, cpu_present_mask)) {
-		flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K;
-		nr_bank = cpumask_to_vp_set(flush, cpus);
+		flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+		nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
 	}
 
 	if (!nr_bank) {
@@ -296,14 +250,3 @@ void hyperv_setup_mmu_ops(void)
 		pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex;
 	}
 }
-
-void hyper_alloc_mmu(void)
-{
-	if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
-		return;
-
-	if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-		pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
-	else
-		pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
-}
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 386a6900e206..219faaec51df 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,7 +136,6 @@
 #endif
 
 #ifndef __ASSEMBLY__
-#ifndef __BPF__
 /*
  * This output constraint should be used for any inline asm which has a "call"
  * instruction.  Otherwise the asm may be inserted before the frame pointer
@@ -146,6 +145,5 @@
 register unsigned long current_stack_pointer asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
-#endif
 
 #endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
new file mode 100644
index 000000000000..e958e28f7ab5
--- /dev/null
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEINFO_H
+#define _ASM_X86_CACHEINFO_H
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+
+#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index e1c8dab86670..fb97cf7c4137 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -17,7 +17,6 @@
 
 typedef u32		compat_size_t;
 typedef s32		compat_ssize_t;
-typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
 typedef u16		__compat_uid_t;
@@ -46,16 +45,6 @@ typedef u32		compat_u32;
 typedef u64 __attribute__((aligned(4))) compat_u64;
 typedef u32		compat_uptr_t;
 
-struct compat_timespec {
-	compat_time_t	tv_sec;
-	s32		tv_nsec;
-};
-
-struct compat_timeval {
-	compat_time_t	tv_sec;
-	s32		tv_usec;
-};
-
 struct compat_stat {
 	compat_dev_t	st_dev;
 	u16		__pad1;
@@ -145,10 +134,10 @@ struct compat_ipc64_perm {
 
 struct compat_semid64_ds {
 	struct compat_ipc64_perm sem_perm;
-	compat_time_t  sem_otime;
-	compat_ulong_t __unused1;
-	compat_time_t  sem_ctime;
-	compat_ulong_t __unused2;
+	compat_ulong_t sem_otime;
+	compat_ulong_t sem_otime_high;
+	compat_ulong_t sem_ctime;
+	compat_ulong_t sem_ctime_high;
 	compat_ulong_t sem_nsems;
 	compat_ulong_t __unused3;
 	compat_ulong_t __unused4;
@@ -156,12 +145,12 @@ struct compat_semid64_ds {
 
 struct compat_msqid64_ds {
 	struct compat_ipc64_perm msg_perm;
-	compat_time_t  msg_stime;
-	compat_ulong_t __unused1;
-	compat_time_t  msg_rtime;
-	compat_ulong_t __unused2;
-	compat_time_t  msg_ctime;
-	compat_ulong_t __unused3;
+	compat_ulong_t msg_stime;
+	compat_ulong_t msg_stime_high;
+	compat_ulong_t msg_rtime;
+	compat_ulong_t msg_rtime_high;
+	compat_ulong_t msg_ctime;
+	compat_ulong_t msg_ctime_high;
 	compat_ulong_t msg_cbytes;
 	compat_ulong_t msg_qnum;
 	compat_ulong_t msg_qbytes;
@@ -174,12 +163,12 @@ struct compat_msqid64_ds {
 struct compat_shmid64_ds {
 	struct compat_ipc64_perm shm_perm;
 	compat_size_t  shm_segsz;
-	compat_time_t  shm_atime;
-	compat_ulong_t __unused1;
-	compat_time_t  shm_dtime;
-	compat_ulong_t __unused2;
-	compat_time_t  shm_ctime;
-	compat_ulong_t __unused3;
+	compat_ulong_t shm_atime;
+	compat_ulong_t shm_atime_high;
+	compat_ulong_t shm_dtime;
+	compat_ulong_t shm_dtime_high;
+	compat_ulong_t shm_ctime;
+	compat_ulong_t shm_ctime_high;
 	compat_pid_t   shm_cpid;
 	compat_pid_t   shm_lpid;
 	compat_ulong_t shm_nattch;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b27da9602a6d..aced6c9290d6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -140,6 +140,20 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 
 #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
 
+#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO)
+
+/*
+ * Workaround for the sake of BPF compilation which utilizes kernel
+ * headers, but clang does not support ASM GOTO and fails the build.
+ */
+#ifndef __BPF_TRACING__
+#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
+#endif
+
+#define static_cpu_has(bit)            boot_cpu_has(bit)
+
+#else
+
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
  * These will statically patch the target code for additional
@@ -195,6 +209,7 @@ t_no:
 		boot_cpu_has(bit) :				\
 		_static_cpu_has(bit)				\
 )
+#endif
 
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d554c11e01ff..fb00a2fca990 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
 #define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
-
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
@@ -207,13 +206,19 @@
 #define X86_FEATURE_RETPOLINE_AMD	( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_CDP_L2		( 7*32+15) /* Code and Data Prioritization L2 */
-
+#define X86_FEATURE_MSR_SPEC_CTRL	( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD		( 7*32+17) /* Speculative Store Bypass Disable */
 #define X86_FEATURE_MBA			( 7*32+18) /* Memory Bandwidth Allocation */
 #define X86_FEATURE_RSB_CTXSW		( 7*32+19) /* "" Fill RSB on context switches */
 #define X86_FEATURE_SEV			( 7*32+20) /* AMD Secure Encrypted Virtualization */
-
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 #define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD		( 7*32+24)  /* "" AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS		( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN			( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
@@ -274,9 +279,10 @@
 #define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
 #define X86_FEATURE_IRPERF		(13*32+ 1) /* Instructions Retired Count */
 #define X86_FEATURE_XSAVEERPTR		(13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_IBPB		(13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_IBRS		(13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_STIBP		(13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB		(13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS		(13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP		(13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_VIRT_SSBD		(13*32+25) /* Virtualized Speculative Store Bypass Disable */
 
 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
 #define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */
@@ -320,6 +326,7 @@
 #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57		(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID		(16*32+22) /* RDPID instruction */
+#define X86_FEATURE_CLDEMOTE		(16*32+25) /* CLDEMOTE instruction */
 
 /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV	(17*32+ 0) /* MCA overflow recovery support */
@@ -333,6 +340,7 @@
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_SPEC_CTRL_SSBD	(18*32+31) /* "" Speculative Store Bypass Disable */
 
 /*
  * BUG word(s)
@@ -362,5 +370,6 @@
 #define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
 #define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
 #define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 89ce4bfd241f..ce4d176b3d13 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return dma_ops;
 }
 
-int arch_dma_supported(struct device *dev, u64 mask);
-#define arch_dma_supported arch_dma_supported
-
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+bool arch_dma_alloc_attrs(struct device **dev);
 #define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
 #endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 09ad88572746..c18ed65287d5 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -46,10 +46,24 @@ int ftrace_int3_handler(struct pt_regs *regs);
 #endif /* CONFIG_FUNCTION_TRACER */
 
 
-#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS)
+#ifndef __ASSEMBLY__
+
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Compare the symbol name with the system call name. Skip the
+	 * "__x64_sys", "__ia32_sys" or simple "sys" prefix.
+	 */
+	return !strcmp(sym + 3, name + 3) ||
+		(!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
+		(!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3));
+}
+
+#ifndef COMPILE_OFFSETS
 
 #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
-#include <asm/compat.h>
+#include <linux/compat.h>
 
 /*
  * Because ia32 syscalls do not map to x86_64 syscall numbers
@@ -67,6 +81,7 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
 	return false;
 }
 #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
-#endif /* !__ASSEMBLY__  && !COMPILE_OFFSETS */
+#endif /* !COMPILE_OFFSETS */
+#endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 5ea2afd4c871..740a428acf1e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -50,14 +50,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 #define inc_irq_stat(member)	this_cpu_inc(irq_stat.member)
 
-#define local_softirq_pending()	this_cpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x)	\
-		this_cpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x)	this_cpu_or(irq_stat.__softirq_pending, (x))
-
 extern void ack_bad_irq(unsigned int irq);
 
 extern u64 arch_irq_stat_cpu(unsigned int cpu);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 416cb0e0c496..3bfa92c2793c 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -164,6 +164,11 @@
  */
 #define HV_X64_DEPRECATING_AEOI_RECOMMENDED	(1 << 9)
 
+/*
+ * Recommend using cluster IPI hypercalls.
+ */
+#define HV_X64_CLUSTER_IPI_RECOMMENDED         (1 << 10)
+
 /* Recommend using the newer ExProcessorMasks interface */
 #define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED	(1 << 11)
 
@@ -329,12 +334,17 @@ struct hv_tsc_emulation_status {
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
 		(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
 
+#define HV_IPI_LOW_VECTOR	0x10
+#define HV_IPI_HIGH_VECTOR	0xff
+
 /* Declare the various hypercall operations. */
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
 #define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
+#define HVCALL_SEND_IPI				0x000b
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX  0x0013
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX   0x0014
+#define HVCALL_SEND_IPI_EX			0x0015
 #define HVCALL_POST_MESSAGE			0x005c
 #define HVCALL_SIGNAL_EVENT			0x005d
 
@@ -360,7 +370,7 @@ struct hv_tsc_emulation_status {
 #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	BIT(3)
 
 enum HV_GENERIC_SET_FORMAT {
-	HV_GENERIC_SET_SPARCE_4K,
+	HV_GENERIC_SET_SPARSE_4K,
 	HV_GENERIC_SET_ALL,
 };
 
@@ -706,4 +716,22 @@ struct hv_enlightened_vmcs {
 #define HV_STIMER_AUTOENABLE		(1ULL << 3)
 #define HV_STIMER_SINT(config)		(__u8)(((config) >> 16) & 0x0F)
 
+struct ipi_arg_non_ex {
+	u32 vector;
+	u32 reserved;
+	u64 cpu_mask;
+};
+
+struct hv_vpset {
+	u64 format;
+	u64 valid_bank_mask;
+	u64 bank_contents[];
+};
+
+struct ipi_arg_ex {
+	u32 vector;
+	u32 reserved;
+	struct hv_vpset vp_set;
+};
+
 #endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index b3e32b010ab1..c2c01f84df75 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn)
 	return insn_offset_displacement(insn) + insn->displacement.nbytes;
 }
 
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+	return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+		(insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+		 X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
 #endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h
index 35555016b1be..0b44b1abe4d9 100644
--- a/arch/x86/include/asm/intel_mid_vrtc.h
+++ b/arch/x86/include/asm/intel_mid_vrtc.h
@@ -4,7 +4,7 @@
 
 extern unsigned char vrtc_cmos_read(unsigned char reg);
 extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
-extern void vrtc_get_time(struct timespec *now);
-extern int vrtc_set_mmss(const struct timespec *now);
+extern void vrtc_get_time(struct timespec64 *now);
+extern int vrtc_set_mmss(const struct timespec64 *now);
 
 #endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b9375d8c..6de64840dd22 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -94,10 +94,10 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 
 #ifdef CONFIG_X86_64
 
-build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "=r", )
-build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-build_mmio_write(__writeq, "q", unsigned long, "r", )
+build_mmio_read(readq, "q", u64, "=r", :"memory")
+build_mmio_read(__readq, "q", u64, "=r", )
+build_mmio_write(writeq, "q", u64, "r", :"memory")
+build_mmio_write(__writeq, "q", u64, "r", )
 
 #define readq_relaxed(a)	__readq(a)
 #define writeq_relaxed(v, a)	__writeq(v, a)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 404c5fdff859..548d90bbf919 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,11 +34,6 @@
  * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
  */
 #define FIRST_EXTERNAL_VECTOR		0x20
-/*
- * We start allocating at 0x21 to spread out vectors evenly between
- * priority levels. (0x80 is the syscall vector)
- */
-#define VECTOR_OFFSET_START		1
 
 /*
  * Reserve the lowest usable vector (and hence lowest priority)  0x20 for
@@ -119,8 +114,6 @@
 #define FIRST_SYSTEM_VECTOR		NR_VECTORS
 #endif
 
-#define FPU_IRQ				  13
-
 /*
  * Size the maximum number of interrupts.
  *
diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h
index b885a961a150..a34897aef2c2 100644
--- a/arch/x86/include/asm/jailhouse_para.h
+++ b/arch/x86/include/asm/jailhouse_para.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL2.0 */
+/* SPDX-License-Identifier: GPL-2.0 */
 
 /*
  * Jailhouse paravirt detection
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 949c977bc4c9..f4b2588865e9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -924,7 +924,7 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
-	bool (*cpu_has_high_real_mode_segbase)(void);
+	bool (*has_emulated_msr)(int index);
 	void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
 	struct kvm *(*vm_alloc)(void);
@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
+	u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 1775a32f7ea6..97198001e567 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void)
 unsigned char rtc_cmos_read(unsigned char addr);
 void rtc_cmos_write(unsigned char val, unsigned char addr);
 
-extern int mach_set_rtc_mmss(const struct timespec *now);
-extern void mach_get_cmos_time(struct timespec *now);
+extern int mach_set_rtc_mmss(const struct timespec64 *now);
+extern void mach_get_cmos_time(struct timespec64 *now);
 
 #define RTC_IRQ 8
 
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 57e3785d0d26..bbc796eb0a3b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk,
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
-		/* pkey 0 is the default and always allocated */
+		/* pkey 0 is the default and allocated implicitly */
 		mm->context.pkey_allocation_map = 0x1;
 		/* -1 means unallocated or invalid */
 		mm->context.execute_only_pkey = -1;
@@ -288,21 +288,6 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		mpx_notify_unmap(mm, vma, start, end);
 }
 
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-static inline int vma_pkey(struct vm_area_struct *vma)
-{
-	unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
-				      VM_PKEY_BIT2 | VM_PKEY_BIT3;
-
-	return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
-}
-#else
-static inline int vma_pkey(struct vm_area_struct *vma)
-{
-	return 0;
-}
-#endif
-
 /*
  * We only want to enforce protection keys on the current process
  * because we effectively have no access to PKRU for other
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index b90e79610cf7..997192131b7b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -122,6 +122,7 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {}
 #if IS_ENABLED(CONFIG_HYPERV)
 extern struct clocksource *hyperv_cs;
 extern void *hv_hypercall_pg;
+extern void  __percpu  **hyperv_pcpu_input_arg;
 
 static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 {
@@ -258,9 +259,41 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
 	return hv_vp_index[cpu_number];
 }
 
-void hyperv_init(void);
+static inline int cpumask_to_vpset(struct hv_vpset *vpset,
+				    const struct cpumask *cpus)
+{
+	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
+
+	/* valid_bank_mask can represent up to 64 banks */
+	if (hv_max_vp_index / 64 >= 64)
+		return 0;
+
+	/*
+	 * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+	 * structs are not cleared between calls, we risk flushing unneeded
+	 * vCPUs otherwise.
+	 */
+	for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+		vpset->bank_contents[vcpu_bank] = 0;
+
+	/*
+	 * Some banks may end up being empty but this is acceptable.
+	 */
+	for_each_cpu(cpu, cpus) {
+		vcpu = hv_cpu_number_to_vp_number(cpu);
+		vcpu_bank = vcpu / 64;
+		vcpu_offset = vcpu % 64;
+		__set_bit(vcpu_offset, (unsigned long *)
+			  &vpset->bank_contents[vcpu_bank]);
+		if (vcpu_bank >= nr_bank)
+			nr_bank = vcpu_bank + 1;
+	}
+	vpset->valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0);
+	return nr_bank;
+}
+
+void __init hyperv_init(void);
 void hyperv_setup_mmu_ops(void);
-void hyper_alloc_mmu(void);
 void hyperv_report_panic(struct pt_regs *regs, long err);
 bool hv_is_hyperv_initialized(void);
 void hyperv_cleanup(void);
@@ -269,6 +302,13 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs);
 void set_hv_tscchange_cb(void (*cb)(void));
 void clear_hv_tscchange_cb(void);
 void hyperv_stop_tsc_emulation(void);
+
+#ifdef CONFIG_X86_64
+void hv_apic_init(void);
+#else
+static inline void hv_apic_init(void) {}
+#endif
+
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline bool hv_is_hyperv_initialized(void) { return false; }
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b9255e..68b2c3150de1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -42,6 +42,8 @@
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			(1 << 0)   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP			(1 << 1)   /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_SSBD_SHIFT		2	   /* Speculative Store Bypass Disable bit */
+#define SPEC_CTRL_SSBD			(1 << SPEC_CTRL_SSBD_SHIFT)   /* Speculative Store Bypass Disable */
 
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			(1 << 0)   /* Indirect Branch Prediction Barrier */
@@ -60,14 +62,19 @@
 #define NHM_C3_AUTO_DEMOTE		(1UL << 25)
 #define NHM_C1_AUTO_DEMOTE		(1UL << 26)
 #define ATM_LNC_C6_AUTO_DEMOTE		(1UL << 25)
-#define SNB_C1_AUTO_UNDEMOTE		(1UL << 27)
-#define SNB_C3_AUTO_UNDEMOTE		(1UL << 28)
+#define SNB_C3_AUTO_UNDEMOTE		(1UL << 27)
+#define SNB_C1_AUTO_UNDEMOTE		(1UL << 28)
 
 #define MSR_MTRRcap			0x000000fe
 
 #define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
 #define ARCH_CAP_RDCL_NO		(1 << 0)   /* Not susceptible to Meltdown */
 #define ARCH_CAP_IBRS_ALL		(1 << 1)   /* Enhanced IBRS support */
+#define ARCH_CAP_SSB_NO			(1 << 4)   /*
+						    * Not susceptible to Speculative Store Bypass
+						    * attack, so no Speculative Store Bypass
+						    * control required.
+						    */
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
@@ -340,6 +347,8 @@
 #define MSR_AMD64_SEV_ENABLED_BIT	0
 #define MSR_AMD64_SEV_ENABLED		BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
 
+#define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f
+
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
 
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9b143f..f6f6c63da62f 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -217,6 +217,14 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_IBRS,
 };
 
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+	SPEC_STORE_BYPASS_NONE,
+	SPEC_STORE_BYPASS_DISABLE,
+	SPEC_STORE_BYPASS_PRCTL,
+	SPEC_STORE_BYPASS_SECCOMP,
+};
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
@@ -241,22 +249,27 @@ static inline void vmexit_fill_RSB(void)
 #endif
 }
 
-#define alternative_msr_write(_msr, _val, _feature)		\
-	asm volatile(ALTERNATIVE("",				\
-				 "movl %[msr], %%ecx\n\t"	\
-				 "movl %[val], %%eax\n\t"	\
-				 "movl $0, %%edx\n\t"		\
-				 "wrmsr",			\
-				 _feature)			\
-		     : : [msr] "i" (_msr), [val] "i" (_val)	\
-		     : "eax", "ecx", "edx", "memory")
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+	asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+		: : "c" (msr),
+		    "a" ((u32)val),
+		    "d" ((u32)(val >> 32)),
+		    [feature] "i" (feature)
+		: "memory");
+}
 
 static inline void indirect_branch_prediction_barrier(void)
 {
-	alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
-			      X86_FEATURE_USE_IBPB);
+	u64 val = PRED_CMD_IBPB;
+
+	alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
 }
 
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
 /*
  * With retpoline, we must use IBRS to restrict branch prediction
  * before calling into firmware.
@@ -265,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void)
  */
 #define firmware_restrict_branch_speculation_start()			\
 do {									\
+	u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS;			\
+									\
 	preempt_disable();						\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS,	\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 } while (0)
 
 #define firmware_restrict_branch_speculation_end()			\
 do {									\
-	alternative_msr_write(MSR_IA32_SPEC_CTRL, 0,			\
+	u64 val = x86_spec_ctrl_base;					\
+									\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, val,			\
 			      X86_FEATURE_USE_IBRS_FW);			\
 	preempt_enable();						\
 } while (0)
@@ -291,16 +308,20 @@ do {									\
  *    lfence
  *    jmp spec_trap
  *  do_rop:
- *    mov %rax,(%rsp)
+ *    mov %rax,(%rsp) for x86_64
+ *    mov %edx,(%esp) for x86_32
  *    retq
  *
  * Without retpolines configured:
  *
- *    jmp *%rax
+ *    jmp *%rax for x86_64
+ *    jmp *%edx for x86_32
  */
 #ifdef CONFIG_RETPOLINE
-# define RETPOLINE_RAX_BPF_JIT_SIZE	17
-# define RETPOLINE_RAX_BPF_JIT()				\
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	17
+#  define RETPOLINE_RAX_BPF_JIT()				\
+do {								\
 	EMIT1_off32(0xE8, 7);	 /* callq do_rop */		\
 	/* spec_trap: */					\
 	EMIT2(0xF3, 0x90);       /* pause */			\
@@ -308,11 +329,30 @@ do {									\
 	EMIT2(0xEB, 0xF9);       /* jmp spec_trap */		\
 	/* do_rop: */						\
 	EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */	\
-	EMIT1(0xC3);             /* retq */
-#else
-# define RETPOLINE_RAX_BPF_JIT_SIZE	2
-# define RETPOLINE_RAX_BPF_JIT()				\
-	EMIT2(0xFF, 0xE0);	 /* jmp *%rax */
+	EMIT1(0xC3);             /* retq */			\
+} while (0)
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
+do {								\
+	EMIT1_off32(0xE8, 7);	 /* call do_rop */		\
+	/* spec_trap: */					\
+	EMIT2(0xF3, 0x90);       /* pause */			\
+	EMIT3(0x0F, 0xAE, 0xE8); /* lfence */			\
+	EMIT2(0xEB, 0xF9);       /* jmp spec_trap */		\
+	/* do_rop: */						\
+	EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */		\
+	EMIT1(0xC3);             /* ret */			\
+} while (0)
+# endif
+#else /* !CONFIG_RETPOLINE */
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	2
+#  define RETPOLINE_RAX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE0);       /* jmp *%rax */
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE2)        /* jmp *%edx */
+# endif
 #endif
 
 #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 2c5a966dc222..6afac386a434 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -53,7 +53,7 @@
 #define __PHYSICAL_MASK_SHIFT	52
 
 #ifdef CONFIG_X86_5LEVEL
-#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled ? 56 : 47)
+#define __VIRTUAL_MASK_SHIFT	(pgtable_l5_enabled() ? 56 : 47)
 #else
 #define __VIRTUAL_MASK_SHIFT	47
 #endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9be2bf13825b..d49bbf4bb5c8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -574,14 +574,14 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
 }
 
 #define set_pgd(pgdp, pgdval) do {					\
-	if (pgtable_l5_enabled)						\
+	if (pgtable_l5_enabled())						\
 		__set_pgd(pgdp, pgdval);				\
 	else								\
 		set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });	\
 } while (0)
 
 #define pgd_clear(pgdp) do {						\
-	if (pgtable_l5_enabled)						\
+	if (pgtable_l5_enabled())						\
 		set_pgd(pgdp, __pgd(0));				\
 } while (0)
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d32175e30259..662963681ea6 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif
-
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
 #endif  /* __KERNEL__ */
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 263c142a6a6c..ada6410fd2ec 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -167,7 +167,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return;
 	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
@@ -193,7 +193,7 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
 static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
 				  unsigned long address)
 {
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		___p4d_free_tlb(tlb, p4d);
 }
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5f49b4ff0c24..99ecde23c3ec 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
 
 #ifndef __PAGETABLE_P4D_FOLDED
 #define set_pgd(pgdp, pgd)		native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd)			(pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
+#define pgd_clear(pgd)			(pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
 #endif
 
 #ifndef set_p4d
@@ -601,6 +601,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 #define canon_pgprot(p) __pgprot(massage_pgprot(p))
 
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+	return canon_pgprot(prot);
+}
+
 static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
 					 enum page_cache_mode pcm,
 					 enum page_cache_mode new_pcm)
@@ -876,7 +881,7 @@ static inline unsigned long p4d_index(unsigned long address)
 #if CONFIG_PGTABLE_LEVELS > 4
 static inline int pgd_present(pgd_t pgd)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 1;
 	return pgd_flags(pgd) & _PAGE_PRESENT;
 }
@@ -893,9 +898,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
 #define pgd_page(pgd)	pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static __always_inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return (p4d_t *)pgd;
 	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
 }
@@ -904,7 +909,7 @@ static inline int pgd_bad(pgd_t pgd)
 {
 	unsigned long ignore_flags = _PAGE_USER;
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 0;
 
 	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
@@ -915,7 +920,7 @@ static inline int pgd_bad(pgd_t pgd)
 
 static inline int pgd_none(pgd_t pgd)
 {
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return 0;
 	/*
 	 * There is no need to do a workaround for the KNL stray
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index e3225e83db7d..d9a001a4a872 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -15,7 +15,7 @@
 # include <asm/pgtable-2level_types.h>
 #endif
 
-#define pgtable_l5_enabled 0
+#define pgtable_l5_enabled() 0
 
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27718ae..3c5385f9a88f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -220,7 +220,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
 	pgd_t pgd;
 
-	if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+	if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
 		*p4dp = p4d;
 		return;
 	}
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index d5c21a382475..054765ab2da2 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -22,12 +22,23 @@ typedef struct { pteval_t pte; } pte_t;
 
 #ifdef CONFIG_X86_5LEVEL
 extern unsigned int __pgtable_l5_enabled;
-#ifndef pgtable_l5_enabled
-#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
-#endif
+
+#ifdef USE_EARLY_PGTABLE_L5
+/*
+ * cpu_feature_enabled() is not available in early boot code.
+ * Use variable instead.
+ */
+static inline bool pgtable_l5_enabled(void)
+{
+	return __pgtable_l5_enabled;
+}
 #else
-#define pgtable_l5_enabled 0
-#endif
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#endif /* USE_EARLY_PGTABLE_L5 */
+
+#else
+#define pgtable_l5_enabled() 0
+#endif /* CONFIG_X86_5LEVEL */
 
 extern unsigned int pgdir_shift;
 extern unsigned int ptrs_per_p4d;
@@ -102,21 +113,21 @@ extern unsigned int ptrs_per_p4d;
 
 #define LDT_PGD_ENTRY_L4	-3UL
 #define LDT_PGD_ENTRY_L5	-112UL
-#define LDT_PGD_ENTRY		(pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_PGD_ENTRY		(pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
 #define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 
-#define __VMALLOC_BASE_L4	0xffffc90000000000
-#define __VMALLOC_BASE_L5 	0xffa0000000000000
+#define __VMALLOC_BASE_L4	0xffffc90000000000UL
+#define __VMALLOC_BASE_L5 	0xffa0000000000000UL
 
 #define VMALLOC_SIZE_TB_L4	32UL
 #define VMALLOC_SIZE_TB_L5	12800UL
 
-#define __VMEMMAP_BASE_L4	0xffffea0000000000
-#define __VMEMMAP_BASE_L5	0xffd4000000000000
+#define __VMEMMAP_BASE_L4	0xffffea0000000000UL
+#define __VMEMMAP_BASE_L5	0xffd4000000000000UL
 
 #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
 # define VMALLOC_START		vmalloc_base
-# define VMALLOC_SIZE_TB	(pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
+# define VMALLOC_SIZE_TB	(pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
 # define VMEMMAP_START		vmemmap_base
 #else
 # define VMALLOC_START		__VMALLOC_BASE_L4
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index a0ba1ffda0df..19b137f1b3be 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,11 +2,18 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
+#define ARCH_DEFAULT_PKEY	0
+
 #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
 
+static inline bool arch_pkeys_enabled(void)
+{
+	return boot_cpu_has(X86_FEATURE_OSPKE);
+}
+
 /*
  * Try to dedicate one of the protection keys to be used as an
  * execute-only protection key.
@@ -15,7 +22,7 @@ extern int __execute_only_pkey(struct mm_struct *mm);
 static inline int execute_only_pkey(struct mm_struct *mm)
 {
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
-		return 0;
+		return ARCH_DEFAULT_PKEY;
 
 	return __execute_only_pkey(mm);
 }
@@ -49,13 +56,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
 	/*
 	 * "Allocated" pkeys are those that have been returned
-	 * from pkey_alloc().  pkey 0 is special, and never
-	 * returned from pkey_alloc().
+	 * from pkey_alloc() or pkey 0 which is allocated
+	 * implicitly when the mm is created.
 	 */
-	if (pkey <= 0)
+	if (pkey < 0)
 		return false;
 	if (pkey >= arch_max_pkey())
 		return false;
+	/*
+	 * The exec-only pkey is set in the allocation map, but
+	 * is not available to any of the user interfaces like
+	 * mprotect_pkey().
+	 */
+	if (pkey == mm->context.execute_only_pkey)
+		return false;
+
 	return mm_pkey_allocation_map(mm) & (1U << pkey);
 }
 
@@ -106,4 +121,12 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
 extern void copy_init_pkru_to_fpregs(void);
 
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+				      VM_PKEY_BIT2 | VM_PKEY_BIT3;
+
+	return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+
 #endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa4206029e3..e28add6b791f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -186,15 +186,6 @@ extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 void print_cpu_msr(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern u32 get_scattered_cpuid_leaf(unsigned int level,
-				    unsigned int sub_leaf,
-				    enum cpuid_regs_idx reg);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
-
-extern void detect_extended_topology(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
 
 #ifdef CONFIG_X86_32
 extern int have_cpuid_p(void);
@@ -749,13 +740,11 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-extern void early_trap_init(void);
 void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr		early_gdt_descr;
 
-extern void cpu_set_gdt(int);
 extern void switch_to_new_gdt(int);
 extern void load_direct_gdt(int);
 extern void load_fixmap_gdt(int);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index a7471dcd2205..b6033680d458 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,7 +12,7 @@ void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
-			    struct timespec *ts);
+			    struct timespec64 *ts);
 void pvclock_resume(void);
 
 void pvclock_touch_watchdogs(void);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 5e16b5d40d32..3e70bed8a978 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -7,6 +7,14 @@
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
 
+#define _Q_PENDING_LOOPS	(1 << 9)
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
 #define	queued_spin_unlock queued_spin_unlock
 /**
  * queued_spin_unlock - release a queued spinlock
@@ -16,15 +24,9 @@
  */
 static inline void native_queued_spin_unlock(struct qspinlock *lock)
 {
-	smp_store_release((u8 *)lock, 0);
+	smp_store_release(&lock->locked, 0);
 }
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_init_lock_hash(void);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
-
 static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 	pv_queued_spin_lock_slowpath(lock, val);
@@ -40,11 +42,6 @@ static inline bool vcpu_is_preempted(long cpu)
 {
 	return pv_vcpu_is_preempted(cpu);
 }
-#else
-static inline void queued_spin_unlock(struct qspinlock *lock)
-{
-	native_queued_spin_unlock(lock);
-}
 #endif
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 923307ea11c7..9ef5ee03d2d7 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -22,8 +22,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
  *
  * void __pv_queued_spin_unlock(struct qspinlock *lock)
  * {
- *	struct __qspinlock *l = (void *)lock;
- *	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *	u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
  *
  *	if (likely(lockval == _Q_LOCKED_VAL))
  *		return;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f75bff8f9d82..547c4fe50711 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -171,7 +171,6 @@ static inline int wbinvd_on_all_cpus(void)
 	wbinvd();
 	return 0;
 }
-#define smp_num_siblings	1
 #endif /* CONFIG_SMP */
 
 extern unsigned disabled_cpus;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4617a2bf123c..199218719a86 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,8 +27,8 @@
 # endif
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled ? 52 : 44)
-# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled ? 52 : 46)
+# define MAX_PHYSADDR_BITS	(pgtable_l5_enabled() ? 52 : 44)
+# define MAX_PHYSMEM_BITS	(pgtable_l5_enabled() ? 52 : 46)
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
new file mode 100644
index 000000000000..ae7c2c5cd7f0
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <linux/thread_info.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter and also
+ * the guest's version of VIRT_SPEC_CTRL, if emulated.
+ */
+extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest);
+
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+	x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true);
+}
+
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_spec_ctrl:		The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl:	The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ *				(may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+	x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false);
+}
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_ssbd_mask;
+
+static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
+{
+	BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+	return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+	BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+	return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
+{
+	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
+}
+
+#ifdef CONFIG_SMP
+extern void speculative_store_bypass_ht_init(void);
+#else
+static inline void speculative_store_bypass_ht_init(void) { }
+#endif
+
+extern void speculative_store_bypass_update(unsigned long tif);
+
+static inline void speculative_store_bypass_update_current(void)
+{
+	speculative_store_bypass_update(current_thread_info()->flags);
+}
+
+#endif
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 133d9425fced..b6dc698f992a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -111,4 +111,6 @@ static inline unsigned long caller_frame_pointer(void)
 	return (unsigned long)frame;
 }
 
+void show_opcodes(u8 *rip, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
 #endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..d33f92b9fa22 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+		size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -131,14 +132,15 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key);
  * actually do machine check recovery. Everyone else can just
  * use memcpy().
  *
- * Return 0 for success, -EFAULT for fail
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
  */
-static __always_inline __must_check int
+static __always_inline __must_check unsigned long
 memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
 	if (static_branch_unlikely(&mcsafe_key))
-		return memcpy_mcsafe_unrolled(dst, src, cnt);
+		return __memcpy_mcsafe(dst, src, cnt);
 	else
 #endif
 		memcpy(dst, src, cnt);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a5d9521bb2cb..2ff2a30a264f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
+#define TIF_SSBD			5	/* Reduced data speculation */
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
@@ -105,6 +106,7 @@ struct thread_info {
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
+#define _TIF_SSBD		(1 << TIF_SSBD)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -144,7 +146,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 62546b3a398e..62acb613114b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len)
 }
 
 static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+	unsigned long ret;
+
+	__uaccess_begin();
+	ret = memcpy_mcsafe(to, from, len);
+	__uaccess_end();
+	return ret;
+}
+
+static __always_inline __must_check unsigned long
 raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 {
 	int ret = 0;
@@ -194,4 +205,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
 
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ce8b4da07e35..2d27236c16a3 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -170,7 +170,7 @@ struct x86_cpuinit_ops {
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 
-struct timespec;
+struct timespec64;
 
 /**
  * struct x86_legacy_devices - legacy x86 devices
@@ -264,8 +264,8 @@ struct x86_hyper_runtime {
 struct x86_platform_ops {
 	unsigned long (*calibrate_cpu)(void);
 	unsigned long (*calibrate_tsc)(void);
-	void (*get_wallclock)(struct timespec *ts);
-	int (*set_wallclock)(const struct timespec *ts);
+	void (*get_wallclock)(struct timespec64 *ts);
+	int (*set_wallclock)(const struct timespec64 *ts);
 	void (*iommu_shutdown)(void);
 	bool (*is_untracked_pat_range)(u64 start, u64 end);
 	void (*nmi_init)(void);
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 4c851ebb3ceb..0ede697c3961 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,7 +29,7 @@
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
 
-#define KVM_HINTS_DEDICATED      0
+#define KVM_HINTS_REALTIME      0
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
index 809134c644a6..90ab9a795b49 100644
--- a/arch/x86/include/uapi/asm/msgbuf.h
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -1 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X64_MSGBUF_H
+#define __ASM_X64_MSGBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
 #include <asm-generic/msgbuf.h>
+#else
+/*
+ * The msqid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct msqid64_ds {
+	struct ipc64_perm msg_perm;
+	__kernel_time_t msg_stime;	/* last msgsnd time */
+	__kernel_time_t msg_rtime;	/* last msgrcv time */
+	__kernel_time_t msg_ctime;	/* last change time */
+	__kernel_ulong_t msg_cbytes;	/* current number of bytes on queue */
+	__kernel_ulong_t msg_qnum;	/* number of messages in queue */
+	__kernel_ulong_t msg_qbytes;	/* max number of bytes on queue */
+	__kernel_pid_t msg_lspid;	/* pid of last msgsnd */
+	__kernel_pid_t msg_lrpid;	/* last receive pid */
+	__kernel_ulong_t __unused4;
+	__kernel_ulong_t __unused5;
+};
+
+#endif
+
+#endif /* __ASM_GENERIC_MSGBUF_H */
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
index cabd7476bd6c..89de6cd9f0a7 100644
--- a/arch/x86/include/uapi/asm/sembuf.h
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -8,15 +8,24 @@
  * between kernel and user space.
  *
  * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
  * - 2 miscellaneous 32-bit values
+ *
+ * x86_64 and x32 incorrectly added padding here, so the structures
+ * are still incompatible with the padding on x86.
  */
 struct semid64_ds {
 	struct ipc64_perm sem_perm;	/* permissions .. see ipc.h */
+#ifdef __i386__
+	unsigned long	sem_otime;	/* last semop time */
+	unsigned long	sem_otime_high;
+	unsigned long	sem_ctime;	/* last change time */
+	unsigned long	sem_ctime_high;
+#else
 	__kernel_time_t	sem_otime;	/* last semop time */
 	__kernel_ulong_t __unused1;
 	__kernel_time_t	sem_ctime;	/* last change time */
 	__kernel_ulong_t __unused2;
+#endif
 	__kernel_ulong_t sem_nsems;	/* no. of semaphores in array */
 	__kernel_ulong_t __unused3;
 	__kernel_ulong_t __unused4;
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
index 83c05fc2de38..644421f3823b 100644
--- a/arch/x86/include/uapi/asm/shmbuf.h
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -1 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_SHMBUF_H
+#define __ASM_X86_SHMBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
 #include <asm-generic/shmbuf.h>
+#else
+/*
+ * The shmid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct shmid64_ds {
+	struct ipc64_perm	shm_perm;	/* operation perms */
+	size_t			shm_segsz;	/* size of segment (bytes) */
+	__kernel_time_t		shm_atime;	/* last attach time */
+	__kernel_time_t		shm_dtime;	/* last detach time */
+	__kernel_time_t		shm_ctime;	/* last change time */
+	__kernel_pid_t		shm_cpid;	/* pid of creator */
+	__kernel_pid_t		shm_lpid;	/* pid of last operator */
+	__kernel_ulong_t	shm_nattch;	/* no. of current attaches */
+	__kernel_ulong_t	__unused4;
+	__kernel_ulong_t	__unused5;
+};
+
+struct shminfo64 {
+	__kernel_ulong_t	shmmax;
+	__kernel_ulong_t	shmmin;
+	__kernel_ulong_t	shmmni;
+	__kernel_ulong_t	shmseg;
+	__kernel_ulong_t	shmall;
+	__kernel_ulong_t	__unused1;
+	__kernel_ulong_t	__unused2;
+	__kernel_ulong_t	__unused3;
+	__kernel_ulong_t	__unused4;
+};
+
+#endif
+
+#endif /* __ASM_X86_SHMBUF_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dde444f932c1..3b20607d581b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -215,6 +215,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 	apic_id = processor->local_apic_id;
 	enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
 
+	/* Ignore invalid ID */
+	if (apic_id == 0xffffffff)
+		return 0;
+
 	/*
 	 * We need to register disabled CPU as well to permit
 	 * counting disabled CPUs. This allows us to size
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c88e0b127810..b481b95bd8f6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -14,8 +14,11 @@
 #include <asm/amd_nb.h>
 
 #define PCI_DEVICE_ID_AMD_17H_ROOT	0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT	0x15d0
 #define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463
 #define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
 
 /* Protect the PCI config register pairs used for SMN and DF indirect access. */
 static DEFINE_MUTEX(smn_mutex);
@@ -24,6 +27,7 @@ static u32 *flush_words;
 
 static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
 	{}
 };
 
@@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
 	{}
 };
@@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{}
 };
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8b04234e010b..7685444a106b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
 			goto update;
 	}
 	cmsk = cluster_hotplug_mask;
+	cmsk->clusterid = cluster;
 	cluster_hotplug_mask = NULL;
 update:
 	this_cpu_write(cluster_masks, cmsk);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index dfcbe6924eaf..5d0de79fdab0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_apm_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int apm(void *unused)
 {
 	unsigned short	bx;
@@ -2360,7 +2347,7 @@ static int __init apm_init(void)
 	set_desc_base(&gdt[APM_DS >> 3],
 		 (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
 
-	proc_create("apm", 0, NULL, &apm_file_ops);
+	proc_create_single("apm", 0, NULL, proc_apm_show);
 
 	kapmd_task = kthread_create(apm, NULL, "kapmd");
 	if (IS_ERR(kapmd_task)) {
@@ -2446,7 +2433,7 @@ MODULE_PARM_DESC(idle_threshold,
 	"System idle percentage above which to make APM BIOS idle calls");
 module_param(idle_period, int, 0444);
 MODULE_PARM_DESC(idle_period,
-	"Period (in sec/100) over which to caculate the idle percentage");
+	"Period (in sec/100) over which to calculate the idle percentage");
 module_param(smp, bool, 0444);
 MODULE_PARM_DESC(smp,
 	"Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a66229f51b12..7a40196967cb 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
 
-obj-y			:= intel_cacheinfo.o scattered.o topology.o
+obj-y			:= cacheinfo.o scattered.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 12bc0a1139da..082d7875cef8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,7 +9,9 @@
 #include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cacheinfo.h>
 #include <asm/cpu.h>
+#include <asm/spec-ctrl.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
 #include <asm/delay.h>
@@ -297,7 +299,6 @@ static int nearby_node(int apicid)
 }
 #endif
 
-#ifdef CONFIG_SMP
 /*
  * Fix up cpu_core_id for pre-F17h systems to be in the
  * [0 .. cores_per_node - 1] range. Not really needed but
@@ -327,6 +328,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 
 	/* get information required for multi-node processors */
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+		int err;
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -345,21 +347,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		}
 
 		/*
-		 * We may have multiple LLCs if L3 caches exist, so check if we
-		 * have an L3 cache by looking at the L3 cache CPUID leaf.
+		 * In case leaf B is available, use it to derive
+		 * topology information.
 		 */
-		if (cpuid_edx(0x80000006)) {
-			if (c->x86 == 0x17) {
-				/*
-				 * LLC is at the core complex level.
-				 * Core complex id is ApicId[3].
-				 */
-				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
-			} else {
-				/* LLC is at the node level. */
-				per_cpu(cpu_llc_id, cpu) = node_id;
-			}
-		}
+		err = detect_extended_topology(c);
+		if (!err)
+			c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+		cacheinfo_amd_init_llc_id(c, cpu, node_id);
+
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
@@ -375,7 +371,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		legacy_fixup_core_id(c);
 	}
 }
-#endif
 
 /*
  * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
@@ -383,7 +378,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
  */
 static void amd_detect_cmp(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
 	unsigned bits;
 	int cpu = smp_processor_id();
 
@@ -394,17 +388,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
 	c->phys_proc_id = c->initial_apicid >> bits;
 	/* use socket ID also for last level cache */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-	amd_get_topology(c);
-#endif
 }
 
 u16 amd_get_nb_id(int cpu)
 {
-	u16 id = 0;
-#ifdef CONFIG_SMP
-	id = per_cpu(cpu_llc_id, cpu);
-#endif
-	return id;
+	return per_cpu(cpu_llc_id, cpu);
 }
 EXPORT_SYMBOL_GPL(amd_get_nb_id);
 
@@ -554,6 +542,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
 		nodes_per_socket = ((value >> 3) & 7) + 1;
 	}
+
+	if (c->x86 >= 0x15 && c->x86 <= 0x17) {
+		unsigned int bit;
+
+		switch (c->x86) {
+		case 0x15: bit = 54; break;
+		case 0x16: bit = 33; break;
+		case 0x17: bit = 10; break;
+		default: return;
+		}
+		/*
+		 * Try to cache the base value so further operations can
+		 * avoid RMW. If that faults, do not enable SSBD.
+		 */
+		if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+			setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+			setup_force_cpu_cap(X86_FEATURE_SSBD);
+			x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+		}
+	}
 }
 
 static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -791,6 +799,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd_zn(struct cpuinfo_x86 *c)
 {
+	set_cpu_cap(c, X86_FEATURE_ZEN);
 	/*
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
 	 * all up to and including B1.
@@ -842,6 +851,7 @@ static void init_amd(struct cpuinfo_x86 *c)
 	/* Multi core CPU? */
 	if (c->extended_cpuid_level >= 0x80000008) {
 		amd_detect_cmp(c);
+		amd_get_topology(c);
 		srat_detect_node(c);
 	}
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bfca937bdcc3..7416fc206b4a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,8 +12,10 @@
 #include <linux/utsname.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
 
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
@@ -27,6 +29,27 @@
 #include <asm/intel-family.h>
 
 static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
+
+/*
+ * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+ * writes to SPEC_CTRL contain whatever reserved bits have been set.
+ */
+u64 __ro_after_init x86_spec_ctrl_base;
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
 
 void __init check_bugs(void)
 {
@@ -37,9 +60,27 @@ void __init check_bugs(void)
 		print_cpu_info(&boot_cpu_data);
 	}
 
+	/*
+	 * Read the SPEC_CTRL MSR to account for reserved bits which may
+	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+	 * init code as it is not enumerated and depends on the family.
+	 */
+	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+		rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+	/* Allow STIBP in MSR_SPEC_CTRL if supported */
+	if (boot_cpu_has(X86_FEATURE_STIBP))
+		x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+
 	/* Select the proper spectre mitigation before patching alternatives */
 	spectre_v2_select_mitigation();
 
+	/*
+	 * Select proper mitigation for any exposure to the Speculative Store
+	 * Bypass vulnerability.
+	 */
+	ssb_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -93,7 +134,76 @@ static const char *spectre_v2_strings[] = {
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+	SPECTRE_V2_NONE;
+
+void
+x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+{
+	u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+	struct thread_info *ti = current_thread_info();
+
+	/* Is MSR_SPEC_CTRL implemented ? */
+	if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+		/*
+		 * Restrict guest_spec_ctrl to supported values. Clear the
+		 * modifiable bits in the host base value and or the
+		 * modifiable bits from the guest value.
+		 */
+		guestval = hostval & ~x86_spec_ctrl_mask;
+		guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+
+		/* SSBD controlled in MSR_SPEC_CTRL */
+		if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+			hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+
+		if (hostval != guestval) {
+			msrval = setguest ? guestval : hostval;
+			wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+		}
+	}
+
+	/*
+	 * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+	 * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+	 */
+	if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+	    !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+		return;
+
+	/*
+	 * If the host has SSBD mitigation enabled, force it in the host's
+	 * virtual MSR value. If its not permanently enabled, evaluate
+	 * current's TIF_SSBD thread flag.
+	 */
+	if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+		hostval = SPEC_CTRL_SSBD;
+	else
+		hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+	/* Sanitize the guest value */
+	guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+	if (hostval != guestval) {
+		unsigned long tif;
+
+		tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+				 ssbd_spec_ctrl_to_tif(hostval);
+
+		speculative_store_bypass_update(tif);
+	}
+}
+EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+	u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+	if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+		wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+	else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+		wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
 
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
@@ -312,32 +422,289 @@ retpoline_auto:
 }
 
 #undef pr_fmt
+#define pr_fmt(fmt)	"Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+	SPEC_STORE_BYPASS_CMD_NONE,
+	SPEC_STORE_BYPASS_CMD_AUTO,
+	SPEC_STORE_BYPASS_CMD_ON,
+	SPEC_STORE_BYPASS_CMD_PRCTL,
+	SPEC_STORE_BYPASS_CMD_SECCOMP,
+};
+
+static const char *ssb_strings[] = {
+	[SPEC_STORE_BYPASS_NONE]	= "Vulnerable",
+	[SPEC_STORE_BYPASS_DISABLE]	= "Mitigation: Speculative Store Bypass disabled",
+	[SPEC_STORE_BYPASS_PRCTL]	= "Mitigation: Speculative Store Bypass disabled via prctl",
+	[SPEC_STORE_BYPASS_SECCOMP]	= "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static const struct {
+	const char *option;
+	enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] = {
+	{ "auto",	SPEC_STORE_BYPASS_CMD_AUTO },    /* Platform decides */
+	{ "on",		SPEC_STORE_BYPASS_CMD_ON },      /* Disable Speculative Store Bypass */
+	{ "off",	SPEC_STORE_BYPASS_CMD_NONE },    /* Don't touch Speculative Store Bypass */
+	{ "prctl",	SPEC_STORE_BYPASS_CMD_PRCTL },   /* Disable Speculative Store Bypass via prctl */
+	{ "seccomp",	SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
+};
+
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+	enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+	char arg[20];
+	int ret, i;
+
+	if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+		return SPEC_STORE_BYPASS_CMD_NONE;
+	} else {
+		ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+					  arg, sizeof(arg));
+		if (ret < 0)
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+
+		for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
+			if (!match_option(arg, ret, ssb_mitigation_options[i].option))
+				continue;
+
+			cmd = ssb_mitigation_options[i].cmd;
+			break;
+		}
+
+		if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
+			pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+			return SPEC_STORE_BYPASS_CMD_AUTO;
+		}
+	}
+
+	return cmd;
+}
+
+static enum ssb_mitigation __init __ssb_select_mitigation(void)
+{
+	enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
+	enum ssb_mitigation_cmd cmd;
+
+	if (!boot_cpu_has(X86_FEATURE_SSBD))
+		return mode;
+
+	cmd = ssb_parse_cmdline();
+	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
+	    (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+	     cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+		return mode;
+
+	switch (cmd) {
+	case SPEC_STORE_BYPASS_CMD_AUTO:
+	case SPEC_STORE_BYPASS_CMD_SECCOMP:
+		/*
+		 * Choose prctl+seccomp as the default mode if seccomp is
+		 * enabled.
+		 */
+		if (IS_ENABLED(CONFIG_SECCOMP))
+			mode = SPEC_STORE_BYPASS_SECCOMP;
+		else
+			mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
+	case SPEC_STORE_BYPASS_CMD_ON:
+		mode = SPEC_STORE_BYPASS_DISABLE;
+		break;
+	case SPEC_STORE_BYPASS_CMD_PRCTL:
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
+	case SPEC_STORE_BYPASS_CMD_NONE:
+		break;
+	}
+
+	/*
+	 * We have three CPU feature flags that are in play here:
+	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+	 *  - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+	 */
+	if (mode == SPEC_STORE_BYPASS_DISABLE) {
+		setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+		/*
+		 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
+		 * a completely different MSR and bit dependent on family.
+		 */
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+			x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+			wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+			break;
+		case X86_VENDOR_AMD:
+			x86_amd_ssb_disable();
+			break;
+		}
+	}
+
+	return mode;
+}
+
+static void ssb_select_mitigation(void)
+{
+	ssb_mode = __ssb_select_mitigation();
+
+	if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Speculation prctl: " fmt
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+	bool update;
+
+	if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+	    ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+		return -ENXIO;
+
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		/* If speculation is force disabled, enable is not allowed */
+		if (task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ssb_disable(task);
+		update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	case PR_SPEC_DISABLE:
+		task_set_spec_ssb_disable(task);
+		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	case PR_SPEC_FORCE_DISABLE:
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_force_disable(task);
+		update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/*
+	 * If being set on non-current task, delay setting the CPU
+	 * mitigation until it is next scheduled.
+	 */
+	if (task == current && update)
+		speculative_store_bypass_update_current();
+
+	return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_set(task, ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+	if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+		ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+	switch (ssb_mode) {
+	case SPEC_STORE_BYPASS_DISABLE:
+		return PR_SPEC_DISABLE;
+	case SPEC_STORE_BYPASS_SECCOMP:
+	case SPEC_STORE_BYPASS_PRCTL:
+		if (task_spec_ssb_force_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	default:
+		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+			return PR_SPEC_ENABLE;
+		return PR_SPEC_NOT_AFFECTED;
+	}
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssb_prctl_get(task);
+	default:
+		return -ENODEV;
+	}
+}
+
+void x86_spec_ctrl_setup_ap(void)
+{
+	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+		x86_amd_ssb_disable();
+}
 
 #ifdef CONFIG_SYSFS
-ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+			       char *buf, unsigned int bug)
 {
-	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+	if (!boot_cpu_has_bug(bug))
 		return sprintf(buf, "Not affected\n");
-	if (boot_cpu_has(X86_FEATURE_PTI))
-		return sprintf(buf, "Mitigation: PTI\n");
+
+	switch (bug) {
+	case X86_BUG_CPU_MELTDOWN:
+		if (boot_cpu_has(X86_FEATURE_PTI))
+			return sprintf(buf, "Mitigation: PTI\n");
+
+		break;
+
+	case X86_BUG_SPECTRE_V1:
+		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+
+	case X86_BUG_SPECTRE_V2:
+		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+			       spectre_v2_module_string());
+
+	case X86_BUG_SPEC_STORE_BYPASS:
+		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
+	default:
+		break;
+	}
+
 	return sprintf(buf, "Vulnerable\n");
 }
 
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
 ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
-		return sprintf(buf, "Not affected\n");
-	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
 }
 
 ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-		return sprintf(buf, "Not affected\n");
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
 
-	return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
-		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
-		       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
-		       spectre_v2_module_string());
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
 }
 #endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 54d04d574148..38354c66df81 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -20,6 +20,8 @@
 #include <asm/amd_nb.h>
 #include <asm/smp.h>
 
+#include "cpu.h"
+
 #define LVL_1_INST	1
 #define LVL_1_DATA	2
 #define LVL_2		3
@@ -637,6 +639,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
 	return i;
 }
 
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+	/*
+	 * We may have multiple LLCs if L3 caches exist, so check if we
+	 * have an L3 cache by looking at the L3 cache CPUID leaf.
+	 */
+	if (!cpuid_edx(0x80000006))
+		return;
+
+	if (c->x86 < 0x17) {
+		/* LLC is at the node level. */
+		per_cpu(cpu_llc_id, cpu) = node_id;
+	} else if (c->x86 == 0x17 &&
+		   c->x86_model >= 0 && c->x86_model <= 0x1F) {
+		/*
+		 * LLC is at the core complex level.
+		 * Core complex ID is ApicId[3] for these processors.
+		 */
+		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+	} else {
+		/*
+		 * LLC ID is calculated from the number of threads sharing the
+		 * cache.
+		 * */
+		u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
+		u32 llc_index = find_num_cache_leaves(c) - 1;
+
+		cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
+		if (eax)
+			num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
+
+		if (num_sharing_cache) {
+			int bits = get_count_order(num_sharing_cache) - 1;
+
+			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+		}
+	}
+}
+
 void init_amd_cacheinfo(struct cpuinfo_x86 *c)
 {
 
@@ -650,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)
 	}
 }
 
-unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
@@ -802,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
 
-	return l2;
+	if (!l2)
+		cpu_detect_cache_sizes(c);
 }
 
 static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e5ec0f11c0de..14433ff5b828 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,6 +18,13 @@
 #define RNG_ENABLED	(1 << 3)
 #define RNG_ENABLE	(1 << 6)	/* MSR_VIA_RNG */
 
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020
+
 static void init_c3(struct cpuinfo_x86 *c)
 {
 	u32  lo, hi;
@@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 	}
 }
 
+static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+	msr_ctl = vmx_msr_high | vmx_msr_low;
+
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+		set_cpu_cap(c, X86_FEATURE_VNMI);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		      vmx_msr_low, vmx_msr_high);
+		msr_ctl2 = vmx_msr_high | vmx_msr_low;
+		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+			set_cpu_cap(c, X86_FEATURE_EPT);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+			set_cpu_cap(c, X86_FEATURE_VPID);
+	}
+}
+
 static void init_centaur(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_32
@@ -128,6 +160,24 @@ static void init_centaur(struct cpuinfo_x86 *c)
 	clear_cpu_cap(c, 0*32+31);
 #endif
 	early_init_centaur(c);
+	init_intel_cacheinfo(c);
+	detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+	detect_ht(c);
+#endif
+
+	if (c->cpuid_level > 9) {
+		unsigned int eax = cpuid_eax(10);
+
+		/*
+		 * Check for version and the number of counters
+		 * Version(eax[7:0]) can't be 0;
+		 * Counters(eax[15:8]) should be greater than 1;
+		 */
+		if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+	}
+
 	switch (c->x86) {
 #ifdef CONFIG_X86_32
 	case 5:
@@ -199,6 +249,9 @@ static void init_centaur(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_64
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 #endif
+
+	if (cpu_has(c, X86_FEATURE_VMX))
+		centaur_detect_vmx_virtcap(c);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8a5b185735e1..95c8e507580d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask;
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
 
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
 /* correctly size the local cpu masks */
 void __init setup_cpu_local_masks(void)
 {
@@ -577,6 +584,19 @@ static void get_model_name(struct cpuinfo_x86 *c)
 	*(s + 1) = '\0';
 }
 
+void detect_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	c->x86_max_cores = 1;
+	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
+		return;
+
+	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+	if (eax & 0x1f)
+		c->x86_max_cores = (eax >> 26) + 1;
+}
+
 void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -757,17 +777,32 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
 	 * and they also have a different bit for STIBP support. Also,
 	 * a hypervisor might have set the individual AMD bits even on
 	 * Intel CPUs, for finer-grained selection of what's available.
-	 *
-	 * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
-	 * features, which are visible in /proc/cpuinfo and used by the
-	 * kernel. So set those accordingly from the Intel bits.
 	 */
 	if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
 		set_cpu_cap(c, X86_FEATURE_IBRS);
 		set_cpu_cap(c, X86_FEATURE_IBPB);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
 	}
+
 	if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
 		set_cpu_cap(c, X86_FEATURE_STIBP);
+
+	if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+	    cpu_has(c, X86_FEATURE_VIRT_SSBD))
+		set_cpu_cap(c, X86_FEATURE_SSBD);
+
+	if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+		set_cpu_cap(c, X86_FEATURE_IBRS);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+	}
+
+	if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+		set_cpu_cap(c, X86_FEATURE_IBPB);
+
+	if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+		set_cpu_cap(c, X86_FEATURE_STIBP);
+		set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+	}
 }
 
 void get_cpu_cap(struct cpuinfo_x86 *c)
@@ -848,6 +883,11 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_power = edx;
 	}
 
+	if (c->extended_cpuid_level >= 0x80000008) {
+		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+	}
+
 	if (c->extended_cpuid_level >= 0x8000000a)
 		c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
@@ -871,7 +911,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
 
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
-		c->x86_capability[CPUID_8000_0008_EBX] = ebx;
 	}
 #ifdef CONFIG_X86_32
 	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -923,21 +962,47 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
 	{}
 };
 
-static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
+/* Only list CPUs which speculate but are non susceptible to SSB */
+static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
+	{ X86_VENDOR_AMD,	0x12,					},
+	{ X86_VENDOR_AMD,	0x11,					},
+	{ X86_VENDOR_AMD,	0x10,					},
+	{ X86_VENDOR_AMD,	0xf,					},
+	{}
+};
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
-	if (x86_match_cpu(cpu_no_meltdown))
-		return false;
+	if (x86_match_cpu(cpu_no_speculation))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
 
 	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
+	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
+	   !(ia32_cap & ARCH_CAP_SSB_NO))
+		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+	if (x86_match_cpu(cpu_no_meltdown))
+		return;
+
 	/* Rogue Data Cache Load? No! */
 	if (ia32_cap & ARCH_CAP_RDCL_NO)
-		return false;
+		return;
 
-	return true;
+	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 }
 
 /*
@@ -988,12 +1053,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
-	if (!x86_match_cpu(cpu_no_speculation)) {
-		if (cpu_vulnerable_to_meltdown(c))
-			setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
-		setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
-	}
+	cpu_set_bug_bits(c);
 
 	fpu__init_system(c);
 
@@ -1004,6 +1064,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	 */
 	setup_clear_cpu_cap(X86_FEATURE_PCID);
 #endif
+
+	/*
+	 * Later in the boot process pgtable_l5_enabled() relies on
+	 * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+	 * enabled by this point we need to clear the feature bit to avoid
+	 * false-positives at the later stage.
+	 *
+	 * pgtable_l5_enabled() can be false here for several reasons:
+	 *  - 5-level paging is disabled compile-time;
+	 *  - it's 32-bit kernel;
+	 *  - machine doesn't support 5-level paging;
+	 *  - user specified 'no5lvl' in kernel command line.
+	 */
+	if (!pgtable_l5_enabled())
+		setup_clear_cpu_cap(X86_FEATURE_LA57);
 }
 
 void __init early_cpu_init(void)
@@ -1355,6 +1430,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 #endif
 	mtrr_ap_init();
 	validate_apic_and_package_id(c);
+	x86_spec_ctrl_setup_ap();
 }
 
 static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index e806b11a99af..38216f678fc3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -47,7 +47,19 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
 
 extern void get_cpu_cap(struct cpuinfo_x86 *c);
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+				    unsigned int sub_leaf,
+				    enum cpuid_regs_idx reg);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
 
 unsigned int aperfmperf_get_khz(int cpu);
 
+extern void x86_spec_ctrl_setup_ap(void);
+
 #endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b9693b80fc21..eb75564f2d25 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -188,7 +188,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_IBPB);
 		setup_clear_cpu_cap(X86_FEATURE_STIBP);
 		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+		setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
 		setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+		setup_clear_cpu_cap(X86_FEATURE_SSBD);
+		setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
 	}
 
 	/*
@@ -453,24 +456,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 #endif
 }
 
-/*
- * find out the number of processor cores on the die
- */
-static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
-		return 1;
-
-	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
-	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
-	if (eax & 0x1f)
-		return (eax >> 26) + 1;
-	else
-		return 1;
-}
-
 static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 {
 	/* Intel VMX MSR indicated features */
@@ -653,8 +638,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 
 static void init_intel(struct cpuinfo_x86 *c)
 {
-	unsigned int l2 = 0;
-
 	early_init_intel(c);
 
 	intel_workarounds(c);
@@ -671,19 +654,13 @@ static void init_intel(struct cpuinfo_x86 *c)
 		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
 		 * detection.
 		 */
-		c->x86_max_cores = intel_num_cpu_cores(c);
+		detect_num_cpu_cores(c);
 #ifdef CONFIG_X86_32
 		detect_ht(c);
 #endif
 	}
 
-	l2 = init_intel_cacheinfo(c);
-
-	/* Detect legacy cache sizes if init_intel_cacheinfo did not */
-	if (l2 == 0) {
-		cpu_detect_cache_sizes(c);
-		l2 = c->x86_cache_size;
-	}
+	init_intel_cacheinfo(c);
 
 	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
@@ -696,7 +673,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 
 	if (boot_cpu_has(X86_FEATURE_DS)) {
-		unsigned int l1;
+		unsigned int l1, l2;
+
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
 		if (!(l1 & (1<<11)))
 			set_cpu_cap(c, X86_FEATURE_BTS);
@@ -724,6 +702,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 	 * Dixon is NOT a Celeron.
 	 */
 	if (c->x86 == 6) {
+		unsigned int l2 = c->x86_cache_size;
 		char *p = NULL;
 
 		switch (c->x86_model) {
@@ -835,6 +814,9 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, full associative" },
 	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x6b, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 8-way associative" },
+	{ 0x6c, TLB_DATA_2M_4M,		128,	" TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
+	{ 0x6d, TLB_DATA_1G,		16,	" TLB_DATA 1 GByte pages, fully associative" },
 	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
 	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },
 	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 589b948e6e01..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,8 +33,8 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW	100u
 #define MBA_IS_LINEAR	0x4
+#define MBA_MAX_MBPS	U32_MAX
 
 /* Mutex to protect rdtgroup access. */
 DEFINE_MUTEX(rdtgroup_mutex);
@@ -178,7 +178,7 @@ struct rdt_resource rdt_resources_all[] = {
 		.msr_update		= mba_wrmsr,
 		.cache_level		= 3,
 		.parse_ctrlval		= parse_bw,
-		.format_str		= "%d=%*d",
+		.format_str		= "%d=%*u",
 		.fflags			= RFTYPE_RES_MB,
 	},
 };
@@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void)
 	rdt_alloc_capable = true;
 }
 
+bool is_mba_sc(struct rdt_resource *r)
+{
+	if (!r)
+		return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
+
+	return r->membw.mba_sc;
+}
+
 /*
  * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
  * exposed to user interface and the h/w understandable delay values.
@@ -341,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
@@ -431,25 +439,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
 	return NULL;
 }
 
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+{
+	int i;
+
+	/*
+	 * Initialize the Control MSRs to having no control.
+	 * For Cache Allocation: Set all bits in cbm
+	 * For Memory Allocation: Set b/w requested to 100%
+	 * and the bandwidth in MBps to U32_MAX
+	 */
+	for (i = 0; i < r->num_closid; i++, dc++, dm++) {
+		*dc = r->default_ctrl;
+		*dm = MBA_MAX_MBPS;
+	}
+}
+
 static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 {
 	struct msr_param m;
-	u32 *dc;
-	int i;
+	u32 *dc, *dm;
 
 	dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
 	if (!dc)
 		return -ENOMEM;
 
-	d->ctrl_val = dc;
+	dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+	if (!dm) {
+		kfree(dc);
+		return -ENOMEM;
+	}
 
-	/*
-	 * Initialize the Control MSRs to having no control.
-	 * For Cache Allocation: Set all bits in cbm
-	 * For Memory Allocation: Set b/w requested to 100
-	 */
-	for (i = 0; i < r->num_closid; i++, dc++)
-		*dc = r->default_ctrl;
+	d->ctrl_val = dc;
+	d->mbps_val = dm;
+	setup_default_ctrlval(r, dc, dm);
 
 	m.low = 0;
 	m.high = r->num_closid;
@@ -588,6 +611,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 		}
 
 		kfree(d->ctrl_val);
+		kfree(d->mbps_val);
 		kfree(d->rmid_busy_llc);
 		kfree(d->mbm_total);
 		kfree(d->mbm_local);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 3fd7a70ee04a..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
+#define MAX_MBA_BW			100u
 
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
@@ -180,10 +181,20 @@ struct rftype {
  * struct mbm_state - status for each MBM counter in each domain
  * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
  * @prev_msr	Value of IA32_QM_CTR for this RMID last time we read it
+ * @chunks_bw	Total local data moved. Used for bandwidth calculation
+ * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw	The most recent bandwidth in MBps
+ * @delta_bw	Difference between the current and previous bandwidth
+ * @delta_comp	Indicates whether to compute the delta_bw
  */
 struct mbm_state {
 	u64	chunks;
 	u64	prev_msr;
+	u64	chunks_bw;
+	u64	prev_bw_msr;
+	u32	prev_bw;
+	u32	delta_bw;
+	bool	delta_comp;
 };
 
 /**
@@ -202,6 +213,7 @@ struct mbm_state {
  * @cqm_work_cpu:
  *		worker cpu for CQM h/w counters
  * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
+ * @mbps_val:	When mba_sc is enabled, this holds the bandwidth in MBps
  * @new_ctrl:	new ctrl value to be loaded
  * @have_new_ctrl: did user provide new_ctrl for this domain
  */
@@ -217,6 +229,7 @@ struct rdt_domain {
 	int			mbm_work_cpu;
 	int			cqm_work_cpu;
 	u32			*ctrl_val;
+	u32			*mbps_val;
 	u32			new_ctrl;
 	bool			have_new_ctrl;
 };
@@ -259,6 +272,7 @@ struct rdt_cache {
  * @min_bw:		Minimum memory bandwidth percentage user can request
  * @bw_gran:		Granularity at which the memory bandwidth is allocated
  * @delay_linear:	True if memory B/W delay is in linear scale
+ * @mba_sc:		True if MBA software controller(mba_sc) is enabled
  * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
  */
 struct rdt_membw {
@@ -266,6 +280,7 @@ struct rdt_membw {
 	u32		min_bw;
 	u32		bw_gran;
 	u32		delay_linear;
+	bool		mba_sc;
 	u32		*mb_map;
 };
 
@@ -445,6 +460,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
 void mbm_setup_overflow_handler(struct rdt_domain *dom,
 				unsigned long delay_ms);
 void mbm_handle_overflow(struct work_struct *work);
+bool is_mba_sc(struct rdt_resource *r);
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 23e1d5c249c6..116d57b248d3 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
 		return false;
 	}
 
-	if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+	if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
+	    !is_mba_sc(r)) {
 		rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
 				    r->membw.min_bw, r->default_ctrl);
 		return false;
@@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid)
 	struct msr_param msr_param;
 	cpumask_var_t cpu_mask;
 	struct rdt_domain *d;
+	bool mba_sc;
+	u32 *dc;
 	int cpu;
 
 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid)
 	msr_param.high = msr_param.low + 1;
 	msr_param.res = r;
 
+	mba_sc = is_mba_sc(r);
 	list_for_each_entry(d, &r->domains, list) {
-		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+		dc = !mba_sc ? d->ctrl_val : d->mbps_val;
+		if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
 			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
-			d->ctrl_val[closid] = d->new_ctrl;
+			dc[closid] = d->new_ctrl;
 		}
 	}
-	if (cpumask_empty(cpu_mask))
+
+	/*
+	 * Avoid writing the control msr with control values when
+	 * MBA software controller is enabled
+	 */
+	if (cpumask_empty(cpu_mask) || mba_sc)
 		goto done;
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
@@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
 {
 	struct rdt_domain *dom;
 	bool sep = false;
+	u32 ctrl_val;
 
 	seq_printf(s, "%*s:", max_name_width, r->name);
 	list_for_each_entry(dom, &r->domains, list) {
 		if (sep)
 			seq_puts(s, ";");
+
+		ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
+			    dom->mbps_val[closid]);
 		seq_printf(s, r->format_str, dom->id, max_data_width,
-			   dom->ctrl_val[closid]);
+			   ctrl_val);
 		sep = true;
 	}
 	seq_puts(s, "\n");
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 681450eee428..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -225,10 +225,18 @@ void free_rmid(u32 rmid)
 		list_add_tail(&entry->list, &rmid_free_lru);
 }
 
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+{
+	u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
+
+	chunks = (cur_msr << shift) - (prev_msr << shift);
+	return chunks >>= shift;
+}
+
 static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 {
-	u64 chunks, shift, tval;
 	struct mbm_state *m;
+	u64 chunks, tval;
 
 	tval = __rmid_read(rmid, rr->evtid);
 	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
@@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 	}
 
 	if (rr->first) {
-		m->prev_msr = tval;
-		m->chunks = 0;
+		memset(m, 0, sizeof(struct mbm_state));
+		m->prev_bw_msr = m->prev_msr = tval;
 		return 0;
 	}
 
-	shift = 64 - MBM_CNTR_WIDTH;
-	chunks = (tval << shift) - (m->prev_msr << shift);
-	chunks >>= shift;
+	chunks = mbm_overflow_count(m->prev_msr, tval);
 	m->chunks += chunks;
 	m->prev_msr = tval;
 
@@ -270,6 +276,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 }
 
 /*
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps.
+ */
+static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct mbm_state *m = &rr->d->mbm_local[rmid];
+	u64 tval, cur_bw, chunks;
+
+	tval = __rmid_read(rmid, rr->evtid);
+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+	m->chunks_bw += chunks;
+	m->chunks = m->chunks_bw;
+	cur_bw = (chunks * r->mon_scale) >> 20;
+
+	if (m->delta_comp)
+		m->delta_bw = abs(cur_bw - m->prev_bw);
+	m->delta_comp = false;
+	m->prev_bw = cur_bw;
+	m->prev_bw_msr = tval;
+}
+
+/*
  * This is called via IPI to read the CQM/MBM counters
  * on a domain.
  */
@@ -297,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth where as MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values.  To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group.  The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * dont alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only check
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically package wise for each
+	 * rdtgrp everytime the throttle MSR changes value.
+	 *
+	 * This is because (1)the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear.(2)Also since MBA is a core specific
+	 * mechanism, the delta values vary based on number of cores used
+	 * by the rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -314,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
@@ -385,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index fca759d272a1..749856a2e736 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg)
 	wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
 }
 
+static inline bool is_mba_linear(void)
+{
+	return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
+}
+
 static int set_cache_qos_cfg(int level, bool enable)
 {
 	void (*update)(void *arg);
@@ -1041,6 +1046,28 @@ static int set_cache_qos_cfg(int level, bool enable)
 	return 0;
 }
 
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ * MBA software controller is supported only if
+ * MBM is supported and MBA is in linear scale.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
+	struct rdt_domain *d;
+
+	if (!is_mbm_enabled() || !is_mba_linear() ||
+	    mba_sc == is_mba_sc(r))
+		return -EINVAL;
+
+	r->membw.mba_sc = mba_sc;
+	list_for_each_entry(d, &r->domains, list)
+		setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
+
+	return 0;
+}
+
 static int cdp_enable(int level, int data_type, int code_type)
 {
 	struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
@@ -1123,6 +1150,10 @@ static int parse_rdtgroupfs_options(char *data)
 			ret = cdpl2_enable();
 			if (ret)
 				goto out;
+		} else if (!strcmp(token, "mba_MBps")) {
+			ret = set_mba_sc(true);
+			if (ret)
+				goto out;
 		} else {
 			ret = -EINVAL;
 			goto out;
@@ -1445,6 +1476,8 @@ static void rdt_kill_sb(struct super_block *sb)
 	cpus_read_lock();
 	mutex_lock(&rdtgroup_mutex);
 
+	set_mba_sc(false);
+
 	/*Put everything back to default values. */
 	for_each_alloc_enabled_rdt_resource(r)
 		reset_all_ctrls(r);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 475cb4f5f14f..c805a06e14c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -48,7 +48,7 @@ static struct dentry *dfs_inj;
 
 static u8 n_banks;
 
-#define MAX_FLAG_OPT_SIZE	3
+#define MAX_FLAG_OPT_SIZE	4
 #define NBCFG			0x44
 
 enum injection_type {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 42cf2880d0ed..cd76380af79f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1727,6 +1727,21 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+	struct mca_config *cfg = &mca_cfg;
+
+	 /*
+	  * All newer Centaur CPUs support MCE broadcasting. Enable
+	  * synchronization with a one second timeout.
+	  */
+	if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+	     c->x86 > 6) {
+		if (cfg->monarch_timeout < 0)
+			cfg->monarch_timeout = USEC_PER_SEC;
+	}
+}
+
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
@@ -1739,6 +1754,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 		mce_amd_feature_init(c);
 		break;
 		}
+	case X86_VENDOR_CENTAUR:
+		mce_centaur_feature_init(c);
+		break;
 
 	default:
 		break;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f7666eef4a87..f591b01930db 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = {
 	[SMCA_SMU]	= { "smu",		"System Management Unit" },
 };
 
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+	[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
 const char *smca_get_name(enum smca_bank_types t)
 {
 	if (t >= N_SMCA_BANK_TYPES)
@@ -431,8 +436,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
 	wrmsr(MSR_CU_DEF_ERR, low, high);
 }
 
-static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
-				  unsigned int block)
+static u32 smca_get_block_address(unsigned int bank, unsigned int block)
 {
 	u32 low, high;
 	u32 addr = 0;
@@ -443,24 +447,30 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank,
 	if (!block)
 		return MSR_AMD64_SMCA_MCx_MISC(bank);
 
+	/* Check our cache first: */
+	if (smca_bank_addrs[bank][block] != -1)
+		return smca_bank_addrs[bank][block];
+
 	/*
 	 * For SMCA enabled processors, BLKPTR field of the first MISC register
 	 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
 	 */
-	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
-		return addr;
+	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+		goto out;
 
 	if (!(low & MCI_CONFIG_MCAX))
-		return addr;
+		goto out;
 
-	if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+	if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
 	    (low & MASK_BLKPTR_LO))
-		return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+		addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
 
+out:
+	smca_bank_addrs[bank][block] = addr;
 	return addr;
 }
 
-static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
 			     unsigned int bank, unsigned int block)
 {
 	u32 addr = 0, offset = 0;
@@ -468,20 +478,8 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi
 	if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
 		return addr;
 
-	/* Get address from already initialized block. */
-	if (per_cpu(threshold_banks, cpu)) {
-		struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank];
-
-		if (bankp && bankp->blocks) {
-			struct threshold_block *blockp = &bankp->blocks[block];
-
-			if (blockp)
-				return blockp->address;
-		}
-	}
-
 	if (mce_flags.smca)
-		return smca_get_block_address(cpu, bank, block);
+		return smca_get_block_address(bank, block);
 
 	/* Fall back to method we used for older processors: */
 	switch (block) {
@@ -559,7 +557,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			smca_configure(bank, cpu);
 
 		for (block = 0; block < NR_BLOCKS; ++block) {
-			address = get_block_address(cpu, address, low, high, bank, block);
+			address = get_block_address(address, low, high, bank, block);
 			if (!address)
 				break;
 
@@ -1176,7 +1174,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 	if (err)
 		goto out_free;
 recurse:
-	address = get_block_address(cpu, address, low, high, bank, ++block);
+	address = get_block_address(address, low, high, bank, ++block);
 	if (!address)
 		return 0;
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 10c4fc2c91f8..77e201301528 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -564,14 +564,12 @@ static int __reload_late(void *info)
 	apply_microcode_local(&err);
 	spin_unlock(&update_lock);
 
+	/* siblings return UCODE_OK because their engine got updated already */
 	if (err > UCODE_NFOUND) {
 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
-		return -1;
-	/* siblings return UCODE_OK because their engine got updated already */
+		ret = -1;
 	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
 		ret = 1;
-	} else {
-		return ret;
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 32b8e5724f96..1c2cfa0644aa 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -485,7 +485,6 @@ static void show_saved_mc(void)
  */
 static void save_mc_for_early(u8 *mc, unsigned int size)
 {
-#ifdef CONFIG_HOTPLUG_CPU
 	/* Synchronization during CPU hotplug. */
 	static DEFINE_MUTEX(x86_cpu_microcode_mutex);
 
@@ -495,7 +494,6 @@ static void save_mc_for_early(u8 *mc, unsigned int size)
 	show_saved_mc();
 
 	mutex_unlock(&x86_cpu_microcode_mutex);
-#endif
 }
 
 static bool load_builtin_intel_microcode(struct cpio_data *cp)
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index ad9e5ed81181..2ad9107ee980 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y		:= main.o if.o generic.o cleanup.o
+obj-y		:= mtrr.o if.o generic.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 7468de429087..9a19c800fe40 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
 #include <linux/pci.h>
 #include <linux/smp.h>
 #include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820/api.h>
@@ -100,7 +101,7 @@ static int have_wrcomb(void)
 		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
 		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
 		    dev->revision <= 5) {
-			pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+			pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
 			pci_dev_put(dev);
 			return 0;
 		}
@@ -110,7 +111,7 @@ static int have_wrcomb(void)
 		 */
 		if (dev->vendor == PCI_VENDOR_ID_INTEL &&
 		    dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
-			pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
+			pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
 			pci_dev_put(dev);
 			return 0;
 		}
@@ -312,24 +313,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		return error;
 
 	if (type >= MTRR_NUM_TYPES) {
-		pr_warn("mtrr: type: %u invalid\n", type);
+		pr_warn("type: %u invalid\n", type);
 		return -EINVAL;
 	}
 
 	/* If the type is WC, check that this processor supports it */
 	if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
-		pr_warn("mtrr: your processor doesn't support write-combining\n");
+		pr_warn("your processor doesn't support write-combining\n");
 		return -ENOSYS;
 	}
 
 	if (!size) {
-		pr_warn("mtrr: zero sized request\n");
+		pr_warn("zero sized request\n");
 		return -EINVAL;
 	}
 
 	if ((base | (base + size - 1)) >>
 	    (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
-		pr_warn("mtrr: base or size exceeds the MTRR width\n");
+		pr_warn("base or size exceeds the MTRR width\n");
 		return -EINVAL;
 	}
 
@@ -360,8 +361,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 				} else if (types_compatible(type, ltype))
 					continue;
 			}
-			pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
-				" 0x%lx000,0x%lx000\n", base, size, lbase,
+			pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
 				lsize);
 			goto out;
 		}
@@ -369,7 +369,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 		if (ltype != type) {
 			if (types_compatible(type, ltype))
 				continue;
-			pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+			pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
 				base, size, mtrr_attrib_to_str(ltype),
 				mtrr_attrib_to_str(type));
 			goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 			}
 		}
 	} else {
-		pr_info("mtrr: no more MTRRs available\n");
+		pr_info("no more MTRRs available\n");
 	}
 	error = i;
  out:
@@ -407,8 +407,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 static int mtrr_check(unsigned long base, unsigned long size)
 {
 	if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-		pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
-		pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
+		pr_warn("size and base must be multiples of 4 kiB\n");
+		pr_debug("size: 0x%lx  base: 0x%lx\n", size, base);
 		dump_stack();
 		return -1;
 	}
@@ -499,22 +499,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 			}
 		}
 		if (reg < 0) {
-			pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
+			pr_debug("no MTRR for %lx000,%lx000 found\n",
 				 base, size);
 			goto out;
 		}
 	}
 	if (reg >= max) {
-		pr_warn("mtrr: register: %d too big\n", reg);
+		pr_warn("register: %d too big\n", reg);
 		goto out;
 	}
 	mtrr_if->get(reg, &lbase, &lsize, &ltype);
 	if (lsize < 1) {
-		pr_warn("mtrr: MTRR %d not used\n", reg);
+		pr_warn("MTRR %d not used\n", reg);
 		goto out;
 	}
 	if (mtrr_usage_table[reg] < 1) {
-		pr_warn("mtrr: reg: %d has count=0\n", reg);
+		pr_warn("reg: %d has count=0\n", reg);
 		goto out;
 	}
 	if (--mtrr_usage_table[reg] < 1)
@@ -775,7 +775,7 @@ void __init mtrr_bp_init(void)
 	}
 
 	if (!mtrr_enabled()) {
-		pr_info("MTRR: Disabled\n");
+		pr_info("Disabled\n");
 
 		/*
 		 * PAT initialization relies on MTRR's rendezvous handler.
@@ -793,6 +793,9 @@ void mtrr_ap_init(void)
 
 	if (!use_intel() || mtrr_aps_delayed_init)
 		return;
+
+	rcu_cpu_starting(smp_processor_id());
+
 	/*
 	 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
 	 * changed, but this routine will be called in cpu boot time,
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index b099024d339c..81c0afb39d0a 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,7 @@
  * exists, use it for populating initial_apicid and cpu topology
  * detection.
  */
-void detect_extended_topology(struct cpuinfo_x86 *c)
+int detect_extended_topology(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	unsigned int eax, ebx, ecx, edx, sub_index;
@@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 	static bool printed;
 
 	if (c->cpuid_level < 0xb)
-		return;
+		return -1;
 
 	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
 
@@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 	 * check if the cpuid leaf 0xb is actually implemented.
 	 */
 	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
-		return;
+		return -1;
 
 	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
 
@@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
 			       c->cpu_core_id);
 		printed = 1;
 	}
-	return;
 #endif
+	return 0;
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 18fa9d74c182..666a284116ac 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,11 +22,14 @@
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
+#define OPCODE_BUFSIZE 64
+
 int panic_on_unrecovered_nmi;
 int panic_on_io_nmi;
-static unsigned int code_bytes = 64;
 static int die_counter;
 
+static struct pt_regs exec_summary_regs;
+
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info)
 {
@@ -69,9 +72,62 @@ static void printk_stack_address(unsigned long address, int reliable,
 	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in attempt to achieve all of the above.
+ */
+void show_opcodes(u8 *rip, const char *loglvl)
+{
+	unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3;
+	u8 opcodes[OPCODE_BUFSIZE];
+	u8 *ip;
+	int i;
+
+	printk("%sCode: ", loglvl);
+
+	ip = (u8 *)rip - code_prologue;
+	if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) {
+		pr_cont("Bad RIP value.\n");
+		return;
+	}
+
+	for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) {
+		if (ip == rip)
+			pr_cont("<%02x> ", opcodes[i]);
+		else
+			pr_cont("%02x ", opcodes[i]);
+	}
+	pr_cont("\n");
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
+{
+#ifdef CONFIG_X86_32
+	printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+	printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+	show_opcodes((u8 *)regs->ip, loglvl);
+}
+
 void show_iret_regs(struct pt_regs *regs)
 {
-	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+	show_ip(regs, KERN_DEFAULT);
 	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
 		regs->sp, regs->flags);
 }
@@ -267,7 +323,6 @@ unsigned long oops_begin(void)
 	bust_spinlocks(1);
 	return flags;
 }
-EXPORT_SYMBOL_GPL(oops_begin);
 NOKPROBE_SYMBOL(oops_begin);
 
 void __noreturn rewind_stack_do_exit(int signr);
@@ -287,6 +342,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 	raw_local_irq_restore(flags);
 	oops_exit();
 
+	/* Executive summary in case the oops scrolled away */
+	__show_regs(&exec_summary_regs, true);
+
 	if (!signr)
 		return;
 	if (in_interrupt())
@@ -305,10 +363,10 @@ NOKPROBE_SYMBOL(oops_end);
 
 int __die(const char *str, struct pt_regs *regs, long err)
 {
-#ifdef CONFIG_X86_32
-	unsigned short ss;
-	unsigned long sp;
-#endif
+	/* Save the regs of the first oops for the executive summary later. */
+	if (!die_counter)
+		exec_summary_regs = *regs;
+
 	printk(KERN_DEFAULT
 	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
@@ -318,26 +376,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
 	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
 
+	show_regs(regs);
+	print_modules();
+
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
-	print_modules();
-	show_regs(regs);
-#ifdef CONFIG_X86_32
-	if (user_mode(regs)) {
-		sp = regs->sp;
-		ss = regs->ss;
-	} else {
-		sp = kernel_stack_pointer(regs);
-		savesegment(ss, ss);
-	}
-	printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
-	       (void *)regs->ip, ss, sp);
-#else
-	/* Executive summary in case the oops scrolled away */
-	printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
-#endif
 	return 0;
 }
 NOKPROBE_SYMBOL(__die);
@@ -356,30 +401,9 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-static int __init code_bytes_setup(char *s)
-{
-	ssize_t ret;
-	unsigned long val;
-
-	if (!s)
-		return -EINVAL;
-
-	ret = kstrtoul(s, 0, &val);
-	if (ret)
-		return ret;
-
-	code_bytes = val;
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
-
 void show_regs(struct pt_regs *regs)
 {
 	bool all = true;
-	int i;
 
 	show_regs_print_info(KERN_DEFAULT);
 
@@ -389,36 +413,8 @@ void show_regs(struct pt_regs *regs)
 	__show_regs(regs, all);
 
 	/*
-	 * When in-kernel, we also print out the stack and code at the
-	 * time of the fault..
+	 * When in-kernel, we also print out the stack at the time of the fault..
 	 */
-	if (!user_mode(regs)) {
-		unsigned int code_prologue = code_bytes * 43 / 64;
-		unsigned int code_len = code_bytes;
-		unsigned char c;
-		u8 *ip;
-
+	if (!user_mode(regs))
 		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
-
-		printk(KERN_DEFAULT "Code: ");
-
-		ip = (u8 *)regs->ip - code_prologue;
-		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
-			/* try starting at IP */
-			ip = (u8 *)regs->ip;
-			code_len = code_len - code_prologue + 1;
-		}
-		for (i = 0; i < code_len; i++, ip++) {
-			if (ip < (u8 *)PAGE_OFFSET ||
-					probe_kernel_address(ip, c)) {
-				pr_cont(" Bad RIP value.");
-				break;
-			}
-			if (ip == (u8 *)regs->ip)
-				pr_cont("<%02x> ", c);
-			else
-				pr_cont("%02x ", c);
-		}
-	}
-	pr_cont("\n");
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6a2cb1442e05..d1f25c831447 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si
 	int x = table->nr_entries;
 
 	if (x >= ARRAY_SIZE(table->entries)) {
-		pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
+		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       start, start + size - 1);
 		return;
 	}
 
@@ -190,9 +191,10 @@ void __init e820__print_table(char *who)
 	int i;
 
 	for (i = 0; i < e820_table->nr_entries; i++) {
-		pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
-		       e820_table->entries[i].addr,
-		       e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+		pr_info("%s: [mem %#018Lx-%#018Lx] ",
+			who,
+			e820_table->entries[i].addr,
+			e820_table->entries[i].addr + e820_table->entries[i].size - 1);
 
 		e820_print_type(e820_table->entries[i].type);
 		pr_cont("\n");
@@ -574,7 +576,7 @@ void __init e820__update_table_print(void)
 	if (e820__update_table(e820_table))
 		return;
 
-	pr_info("e820: modified physical RAM map:\n");
+	pr_info("modified physical RAM map:\n");
 	e820__print_table("modified");
 }
 
@@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void)
 	if (!found) {
 #ifdef CONFIG_X86_64
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
-		pr_err(
-			"e820: Cannot find an available gap in the 32-bit address range\n"
-			"e820: PCI devices with unassigned 32-bit BARs may not work!\n");
+		pr_err("Cannot find an available gap in the 32-bit address range\n");
+		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
 #else
 		gapstart = 0x10000000;
 #endif
@@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void)
 	 */
 	pci_mem_start = gapstart;
 
-	pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
+	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+		gapstart, gapstart + gapsize - 1);
 }
 
 /*
@@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 
 	early_memunmap(sdata, data_len);
-	pr_info("e820: extended physical RAM map:\n");
+	pr_info("extended physical RAM map:\n");
 	e820__print_table("extended");
 }
 
@@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
-		pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+		pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
 		e820__update_table_kexec();
 	}
 
@@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
-			 last_pfn, max_arch_pfn);
+	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+		last_pfn, max_arch_pfn);
 	return last_pfn;
 }
 
@@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void)
 		if (e820__update_table(e820_table) < 0)
 			early_panic("Invalid user supplied memory map");
 
-		pr_info("e820: user-defined physical RAM map:\n");
+		pr_info("user-defined physical RAM map:\n");
 		e820__print_table("user");
 	}
 }
@@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void)
 	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
 	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 
-	pr_info("e820: BIOS-provided physical RAM map:\n");
+	pr_info("BIOS-provided physical RAM map:\n");
 	e820__print_table(who);
 }
 
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bae0d32e327b..da5d8ac60062 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -28,8 +28,6 @@
 #include <asm/irq_remapping.h>
 #include <asm/early_ioremap.h>
 
-#define dev_err(msg)  pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
-
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
 	u32 htcfg;
@@ -617,7 +615,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
 
 		pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
 		if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
-			dev_err("Cannot power up Apple AirPort card\n");
+			pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+			       bus, slot, func);
 			return;
 		}
 	}
@@ -628,7 +627,8 @@ static void __init apple_airport_reset(int bus, int slot, int func)
 
 	mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
 	if (!mmio) {
-		dev_err("Cannot iomap Apple AirPort card\n");
+		pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+		       bus, slot, func);
 		return;
 	}
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0c408f8c4ed4..a21d6ace648e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -6,6 +6,10 @@
  */
 
 #define DISABLE_BRANCH_PROFILING
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <linux/types.h>
@@ -32,11 +36,6 @@
 #include <asm/microcode.h>
 #include <asm/kasan.h>
 
-#ifdef CONFIG_X86_5LEVEL
-#undef pgtable_l5_enabled
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
-
 /*
  * Manage page tables very early on.
  */
@@ -45,8 +44,7 @@ static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __ro_after_init;
-EXPORT_SYMBOL(__pgtable_l5_enabled);
+unsigned int __pgtable_l5_enabled __initdata;
 unsigned int pgdir_shift __ro_after_init = 39;
 EXPORT_SYMBOL(pgdir_shift);
 unsigned int ptrs_per_p4d __ro_after_init = 1;
@@ -82,13 +80,14 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
 
 static bool __head check_la57_support(unsigned long physaddr)
 {
-	if (native_cpuid_eax(0) < 7)
-		return false;
-
-	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+	/*
+	 * 5-level paging is detected and enabled at kernel decomression
+	 * stage. Only check if it has been enabled there.
+	 */
+	if (!(native_read_cr4() & X86_CR4_LA57))
 		return false;
 
-	*fixup_int(&pgtable_l5_enabled, physaddr) = 1;
+	*fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
 	*fixup_int(&pgdir_shift, physaddr) = 48;
 	*fixup_int(&ptrs_per_p4d, physaddr) = 512;
 	*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
@@ -104,6 +103,12 @@ static bool __head check_la57_support(unsigned long physaddr)
 }
 #endif
 
+/* Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be adjusted using fixup_pointer().
+ */
 unsigned long __head __startup_64(unsigned long physaddr,
 				  struct boot_params *bp)
 {
@@ -113,6 +118,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	p4dval_t *p4d;
 	pudval_t *pud;
 	pmdval_t *pmd, pmd_entry;
+	pteval_t *mask_ptr;
 	bool la57;
 	int i;
 	unsigned int *next_pgt_ptr;
@@ -196,7 +202,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
 
 	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
 	/* Filter out unsupported __PAGE_KERNEL_* bits: */
-	pmd_entry &= __supported_pte_mask;
+	mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
+	pmd_entry &= *mask_ptr;
 	pmd_entry += sme_get_me_mask();
 	pmd_entry +=  physaddr;
 
@@ -273,7 +280,7 @@ again:
 	 * critical -- __PAGE_OFFSET would point us back into the dynamic
 	 * range and we might end up looping forever...
 	 */
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		p4d_p = pgd_p;
 	else if (pgd)
 		p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8ce4212e2b8d..b6be34ee88e9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -975,8 +975,7 @@ int __init hpet_enable(void)
 	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
 	hpet_writel(cfg, HPET_CFG);
 	if (cfg)
-		pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
-			cfg);
+		pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
 
 	for (i = 0; i <= last; ++i) {
 		cfg = hpet_readl(HPET_Tn_CFG(i));
@@ -988,7 +987,7 @@ int __init hpet_enable(void)
 			 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
 			 | HPET_TN_FSB | HPET_TN_FSB_CAP);
 		if (cfg)
-			pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+			pr_warn("Unrecognized bits %#x set in cfg#%u\n",
 				cfg, i);
 	}
 	hpet_print_config();
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index fa183a131edc..108c48d0d40e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL2.0
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Jailhouse paravirt_ops implementation
  *
@@ -37,7 +37,7 @@ static uint32_t __init jailhouse_detect(void)
 	return jailhouse_cpuid_base();
 }
 
-static void jailhouse_get_wallclock(struct timespec *now)
+static void jailhouse_get_wallclock(struct timespec64 *now)
 {
 	memset(now, 0, sizeof(*now));
 }
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3182908b7e6c..7326078eaa7a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -398,11 +398,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	 * little bit simple
 	 */
 	efi_map_sz = efi_get_runtime_map_size();
-	efi_map_sz = ALIGN(efi_map_sz, 16);
 	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
 				MAX_ELFCOREHDR_STR_LEN;
 	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
-	kbuf.bufsz = params_cmdline_sz + efi_map_sz +
+	kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
 				sizeof(struct setup_data) +
 				sizeof(struct efi_setup_data);
 
@@ -410,7 +409,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	if (!params)
 		return ERR_PTR(-ENOMEM);
 	efi_map_offset = params_cmdline_sz;
-	efi_setup_data_offset = efi_map_offset + efi_map_sz;
+	efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
 
 	/* Copy setup header onto bootparams. Documentation/x86/boot.txt */
 	setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0715f827607c..6f4d42377fe5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
 	if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
 
+	/* We should not singlestep on the exception masking instructions */
+	if (insn_masking_exception(insn))
+		return 0;
+
 #ifdef CONFIG_X86_64
 	/* Only x86_64 has RIP relative instructions */
 	if (insn_rip_relative(insn)) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7867417cfaff..5b2300b818af 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void)
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
 	native_smp_prepare_cpus(max_cpus);
-	if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
 		static_branch_disable(&virt_spin_lock_key);
 }
 
@@ -553,7 +553,7 @@ static void __init kvm_guest_init(void)
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
 		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
 
@@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
 	int cpu;
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-	    !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
+	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		for_each_possible_cpu(cpu) {
 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void)
 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
 		return;
 
-	if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
 		return;
 
 	__pv_init_lock_hash();
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 8b26c9e01cc4..bf8d1eb7fca3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -53,7 +53,7 @@ static struct pvclock_wall_clock *wall_clock;
  * have elapsed since the hypervisor wrote the data. So we try to account for
  * that with system time
  */
-static void kvm_get_wallclock(struct timespec *now)
+static void kvm_get_wallclock(struct timespec64 *now)
 {
 	struct pvclock_vcpu_time_info *vcpu_time;
 	int low, high;
@@ -72,7 +72,7 @@ static void kvm_get_wallclock(struct timespec *now)
 	put_cpu();
 }
 
-static int kvm_set_wallclock(const struct timespec *now)
+static int kvm_set_wallclock(const struct timespec64 *now)
 {
 	return -ENODEV;
 }
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d41d896481b8..c9b14020f4dd 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -166,7 +166,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 		 */
 		pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
 		/* Filter out unsuppored __PAGE_KERNEL* bits: */
-		pgprot_val(pte_prot) |= __supported_pte_mask;
+		pgprot_val(pte_prot) &= __supported_pte_mask;
 		pte = pfn_pte(pfn, pte_prot);
 		set_pte_at(mm, va, ptep, pte);
 		pte_unmap_unlock(ptep, ptl);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 60cdec6628b0..d1ab07ec8c9a 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -57,12 +57,17 @@ static void load_segments(void)
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
 	free_page((unsigned long)image->arch.pgd);
+	image->arch.pgd = NULL;
 #ifdef CONFIG_X86_PAE
 	free_page((unsigned long)image->arch.pmd0);
+	image->arch.pmd0 = NULL;
 	free_page((unsigned long)image->arch.pmd1);
+	image->arch.pmd1 = NULL;
 #endif
 	free_page((unsigned long)image->arch.pte0);
+	image->arch.pte0 = NULL;
 	free_page((unsigned long)image->arch.pte1);
+	image->arch.pte1 = NULL;
 }
 
 static int machine_kexec_alloc_page_tables(struct kimage *image)
@@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
 	    !image->arch.pmd0 || !image->arch.pmd1 ||
 #endif
 	    !image->arch.pte0 || !image->arch.pte1) {
-		machine_kexec_free_page_tables(image);
 		return -ENOMEM;
 	}
 	return 0;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a5e55d832d0a..4c8acdfdc5a7 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -39,9 +39,13 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
 static void free_transition_pgtable(struct kimage *image)
 {
 	free_page((unsigned long)image->arch.p4d);
+	image->arch.p4d = NULL;
 	free_page((unsigned long)image->arch.pud);
+	image->arch.pud = NULL;
 	free_page((unsigned long)image->arch.pmd);
+	image->arch.pmd = NULL;
 	free_page((unsigned long)image->arch.pte);
+	image->arch.pte = NULL;
 }
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
@@ -91,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
 	return 0;
 err:
-	free_transition_pgtable(image);
 	return result;
 }
 
@@ -351,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void)
 {
 	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_top_pgt);
-	VMCOREINFO_NUMBER(pgtable_l5_enabled);
+	vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+			pgtable_l5_enabled());
 
 #ifdef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 77625b60a510..ab5d9dd668d2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -15,13 +15,11 @@
 #include <asm/x86_init.h>
 #include <asm/iommu_table.h>
 
-static int forbid_dac __read_mostly;
+static bool disable_dac_quirk __read_mostly;
 
 const struct dma_map_ops *dma_ops = &dma_direct_ops;
 EXPORT_SYMBOL(dma_ops);
 
-static int iommu_sac_force __read_mostly;
-
 #ifdef CONFIG_IOMMU_DEBUG
 int panic_on_overflow __read_mostly = 1;
 int force_iommu __read_mostly = 1;
@@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = {
 };
 EXPORT_SYMBOL(x86_dma_fallback_dev);
 
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES       65536
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
@@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void)
 	}
 }
 
-bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
+bool arch_dma_alloc_attrs(struct device **dev)
 {
 	if (!*dev)
 		*dev = &x86_dma_fallback_dev;
@@ -125,13 +120,13 @@ static __init int iommu_setup(char *p)
 		if (!strncmp(p, "nomerge", 7))
 			iommu_merge = 0;
 		if (!strncmp(p, "forcesac", 8))
-			iommu_sac_force = 1;
+			pr_warn("forcesac option ignored.\n");
 		if (!strncmp(p, "allowdac", 8))
-			forbid_dac = 0;
+			pr_warn("allowdac option ignored.\n");
 		if (!strncmp(p, "nodac", 5))
-			forbid_dac = 1;
+			pr_warn("nodac option ignored.\n");
 		if (!strncmp(p, "usedac", 6)) {
-			forbid_dac = -1;
+			disable_dac_quirk = true;
 			return 1;
 		}
 #ifdef CONFIG_SWIOTLB
@@ -156,40 +151,9 @@ static __init int iommu_setup(char *p)
 }
 early_param("iommu", iommu_setup);
 
-int arch_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
-	if (mask > 0xffffffff && forbid_dac > 0) {
-		dev_info(dev, "PCI: Disallowing DAC for device\n");
-		return 0;
-	}
-#endif
-
-	/* Tell the device to use SAC when IOMMU force is on.  This
-	   allows the driver to use cheaper accesses in some cases.
-
-	   Problem with this is that if we overflow the IOMMU area and
-	   return DAC as fallback address the device may not handle it
-	   correctly.
-
-	   As a special case some controllers have a 39bit address
-	   mode that is as efficient as 32bit (aic79xx). Don't force
-	   SAC for these.  Assume all masks <= 40 bits are of this
-	   type. Normally this doesn't make any difference, but gives
-	   more gentle handling of IOMMU overflow. */
-	if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
-		dev_info(dev, "Force SAC with mask %Lx\n", mask);
-		return 0;
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL(arch_dma_supported);
-
 static int __init pci_iommu_init(void)
 {
 	struct iommu_table_entry *p;
-	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 
 #ifdef CONFIG_PCI
 	dma_debug_add_bus(&pci_bus_type);
@@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init);
 #ifdef CONFIG_PCI
 /* Many VIA bridges seem to corrupt data for DAC. Disable it here */
 
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+	pdev->dev.dma_32bit_limit = true;
+	return 0;
+}
+
 static void via_no_dac(struct pci_dev *dev)
 {
-	if (forbid_dac == 0) {
+	if (!disable_dac_quirk) {
 		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
-		forbid_dac = 1;
+		pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
 	}
 }
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
deleted file mode 100644
index ac7ea3a8242f..000000000000
--- a/arch/x86/kernel/pci-nommu.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Fallback functions when the main IOMMU code is not compiled in. This
-   code is roughly equivalent to i386. */
-#include <linux/dma-direct.h>
-#include <linux/scatterlist.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/pci.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-#define NOMMU_MAPPING_ERROR		0
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
-	if (hwdev && !dma_capable(hwdev, bus, size)) {
-		if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
-			printk(KERN_ERR
-			    "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
-				name, (long long)bus, size,
-				(long long)*hwdev->dma_mask);
-		return 0;
-	}
-	return 1;
-}
-
-static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction dir,
-				 unsigned long attrs)
-{
-	dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
-	WARN_ON(size == 0);
-	if (!check_addr("map_single", dev, bus, size))
-		return NOMMU_MAPPING_ERROR;
-	return bus;
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scatter-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
-			int nents, enum dma_data_direction dir,
-			unsigned long attrs)
-{
-	struct scatterlist *s;
-	int i;
-
-	WARN_ON(nents == 0 || sg[0].length == 0);
-
-	for_each_sg(sg, s, nents, i) {
-		BUG_ON(!sg_page(s));
-		s->dma_address = sg_phys(s);
-		if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
-			return 0;
-		s->dma_length = s->length;
-	}
-	return nents;
-}
-
-static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return dma_addr == NOMMU_MAPPING_ERROR;
-}
-
-const struct dma_map_ops nommu_dma_ops = {
-	.alloc			= dma_generic_alloc_coherent,
-	.free			= dma_generic_free_coherent,
-	.map_sg			= nommu_map_sg,
-	.map_page		= nommu_map_page,
-	.is_phys		= 1,
-	.mapping_error		= nommu_mapping_error,
-	.dma_supported		= x86_dma_supported,
-};
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index e47b2dbbdef3..c06c4c16c6b6 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -151,17 +151,19 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	regs_user_copy->sp = user_regs->sp;
 	regs_user_copy->cs = user_regs->cs;
 	regs_user_copy->ss = user_regs->ss;
-
 	/*
-	 * Most system calls don't save these registers, don't report them.
+	 * Store user space frame-pointer value on sample
+	 * to facilitate stack unwinding for cases when
+	 * user space executable code has such support
+	 * enabled at compile time:
 	 */
+	regs_user_copy->bp = user_regs->bp;
+
 	regs_user_copy->bx = -1;
-	regs_user_copy->bp = -1;
 	regs_user_copy->r12 = -1;
 	regs_user_copy->r13 = -1;
 	regs_user_copy->r14 = -1;
 	regs_user_copy->r15 = -1;
-
 	/*
 	 * For this to be at all useful, we need a reasonable guess for
 	 * the ABI.  Be careful: we're in NMI context, and we're
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 03408b942adb..30ca2d1a9231 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,6 +38,7 @@
 #include <asm/switch_to.h>
 #include <asm/desc.h>
 #include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -278,6 +279,148 @@ static inline void switch_to_bitmap(struct tss_struct *tss,
 	}
 }
 
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+	struct ssb_state	*shared_state;
+	raw_spinlock_t		lock;
+	unsigned int		disable_state;
+	unsigned long		local_state;
+};
+
+#define LSTATE_SSB	0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+	struct ssb_state *st = this_cpu_ptr(&ssb_state);
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	st->local_state = 0;
+
+	/*
+	 * Shared state setup happens once on the first bringup
+	 * of the CPU. It's not destroyed on CPU hotunplug.
+	 */
+	if (st->shared_state)
+		return;
+
+	raw_spin_lock_init(&st->lock);
+
+	/*
+	 * Go over HT siblings and check whether one of them has set up the
+	 * shared state pointer already.
+	 */
+	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+		if (cpu == this_cpu)
+			continue;
+
+		if (!per_cpu(ssb_state, cpu).shared_state)
+			continue;
+
+		/* Link it to the state of the sibling: */
+		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+		return;
+	}
+
+	/*
+	 * First HT sibling to come up on the core.  Link shared state of
+	 * the first HT sibling to itself. The siblings on the same core
+	 * which come up later will see the shared state pointer and link
+	 * themself to the state of this CPU.
+	 */
+	st->shared_state = st;
+}
+
+/*
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and last sibling to disable it, disables it for the whole core. This how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
+ */
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+	struct ssb_state *st = this_cpu_ptr(&ssb_state);
+	u64 msr = x86_amd_ls_cfg_base;
+
+	if (!static_cpu_has(X86_FEATURE_ZEN)) {
+		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+		wrmsrl(MSR_AMD64_LS_CFG, msr);
+		return;
+	}
+
+	if (tifn & _TIF_SSBD) {
+		/*
+		 * Since this can race with prctl(), block reentry on the
+		 * same CPU.
+		 */
+		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+			return;
+
+		msr |= x86_amd_ls_cfg_ssbd_mask;
+
+		raw_spin_lock(&st->shared_state->lock);
+		/* First sibling enables SSBD: */
+		if (!st->shared_state->disable_state)
+			wrmsrl(MSR_AMD64_LS_CFG, msr);
+		st->shared_state->disable_state++;
+		raw_spin_unlock(&st->shared_state->lock);
+	} else {
+		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+			return;
+
+		raw_spin_lock(&st->shared_state->lock);
+		st->shared_state->disable_state--;
+		if (!st->shared_state->disable_state)
+			wrmsrl(MSR_AMD64_LS_CFG, msr);
+		raw_spin_unlock(&st->shared_state->lock);
+	}
+}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
+
+	wrmsrl(MSR_AMD64_LS_CFG, msr);
+}
+#endif
+
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+	/*
+	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+	 * so ssbd_tif_to_spec_ctrl() just works.
+	 */
+	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
+
+static __always_inline void intel_set_ssb_state(unsigned long tifn)
+{
+	u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
+
+	wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+}
+
+static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+{
+	if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
+		amd_set_ssb_virt_state(tifn);
+	else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+		amd_set_core_ssb_state(tifn);
+	else
+		intel_set_ssb_state(tifn);
+}
+
+void speculative_store_bypass_update(unsigned long tif)
+{
+	preempt_disable();
+	__speculative_store_bypass_update(tif);
+	preempt_enable();
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
@@ -309,6 +452,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 
 	if ((tifp ^ tifn) & _TIF_NOCPUID)
 		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+	if ((tifp ^ tifn) & _TIF_SSBD)
+		__speculative_store_bypass_update(tifn);
 }
 
 /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5224c6099184..0ae659de21eb 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all)
 		savesegment(gs, gs);
 	}
 
-	printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
-	printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
-		raw_smp_processor_id());
+	show_ip(regs, KERN_DEFAULT);
 
 	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->ax, regs->bx, regs->cx, regs->dx);
 	printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
 		regs->si, regs->di, regs->bp, sp);
-	printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
-	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+	printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
 
 	if (!all)
 		return;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4b100fe0f508..12bb445fb98d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -542,6 +542,7 @@ void set_personality_64bit(void)
 	clear_thread_flag(TIF_X32);
 	/* Pretend that this comes from a 64bit execve */
 	task_pt_regs(current)->orig_ax = __NR_execve;
+	current_thread_info()->status &= ~TS_COMPAT;
 
 	/* Ensure the corresponding mm is not marked. */
 	if (current->mm)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index ed5c4cdf0a34..e2ee403865eb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk,
 	tsk->thread.trap_nr = X86_TRAP_DB;
 	tsk->thread.error_code = error_code;
 
-	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 	info->si_code = si_code;
 	info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
@@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 {
 	struct siginfo info;
 
+	clear_siginfo(&info);
 	fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 761f6af6efa5..637982efecd8 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -123,28 +123,35 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 			    struct pvclock_vcpu_time_info *vcpu_time,
-			    struct timespec *ts)
+			    struct timespec64 *ts)
 {
 	u32 version;
 	u64 delta;
-	struct timespec now;
+	struct timespec64 now;
 
 	/* get wallclock at system boot */
 	do {
 		version = wall_clock->version;
 		rmb();		/* fetch version before time */
+		/*
+		 * Note: wall_clock->sec is a u32 value, so it can
+		 * only store dates between 1970 and 2106. To allow
+		 * times beyond that, we need to create a new hypercall
+		 * interface with an extended pvclock_wall_clock structure
+		 * like ARM has.
+		 */
 		now.tv_sec  = wall_clock->sec;
 		now.tv_nsec = wall_clock->nsec;
 		rmb();		/* fetch time before checking version */
 	} while ((wall_clock->version & 1) || (version != wall_clock->version));
 
 	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
-	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
 
 	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
 	now.tv_sec = delta;
 
-	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
 }
 
 void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index f7b82ed7b5b5..586f718b8e95 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL(rtc_lock);
  * jump to the next second precisely 500 ms later. Check the Motorola
  * MC146818A or Dallas DS12887 data sheet for details.
  */
-int mach_set_rtc_mmss(const struct timespec *now)
+int mach_set_rtc_mmss(const struct timespec64 *now)
 {
 	unsigned long long nowtime = now->tv_sec;
 	struct rtc_time tm;
@@ -60,7 +60,7 @@ int mach_set_rtc_mmss(const struct timespec *now)
 	return retval;
 }
 
-void mach_get_cmos_time(struct timespec *now)
+void mach_get_cmos_time(struct timespec64 *now)
 {
 	unsigned int status, year, mon, day, hour, min, sec, century = 0;
 	unsigned long flags;
@@ -118,7 +118,7 @@ void mach_get_cmos_time(struct timespec *now)
 	} else
 		year += CMOS_YEARS_OFFS;
 
-	now->tv_sec = mktime(year, mon, day, hour, min, sec);
+	now->tv_sec = mktime64(year, mon, day, hour, min, sec);
 	now->tv_nsec = 0;
 }
 
@@ -145,13 +145,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
 }
 EXPORT_SYMBOL(rtc_cmos_write);
 
-int update_persistent_clock(struct timespec now)
+int update_persistent_clock64(struct timespec64 now)
 {
 	return x86_platform.set_wallclock(&now);
 }
 
 /* not static: needed by APM */
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
 {
 	x86_platform.get_wallclock(ts);
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6285697b6e56..2f86d883dd95 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -50,6 +50,7 @@
 #include <linux/init_ohci1394_dma.h>
 #include <linux/kvm_para.h>
 #include <linux/dma-contiguous.h>
+#include <xen/xen.h>
 
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -534,6 +535,11 @@ static void __init reserve_crashkernel(void)
 		high = true;
 	}
 
+	if (xen_pv_domain()) {
+		pr_info("Ignoring crashkernel for a Xen PV domain\n");
+		return;
+	}
+
 	/* 0 means: find the address automatically */
 	if (crash_base <= 0) {
 		/*
@@ -1306,11 +1312,3 @@ static int __init register_kernel_offset_dumper(void)
 	return 0;
 }
 __initcall(register_kernel_offset_dumper);
-
-void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-	if (!boot_cpu_has(X86_FEATURE_OSPKE))
-		return;
-
-	seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
-}
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 14c057f29979..9ccbf0576cd0 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(NSIGFPE  != 15);
 	BUILD_BUG_ON(NSIGSEGV != 7);
 	BUILD_BUG_ON(NSIGBUS  != 5);
-	BUILD_BUG_ON(NSIGTRAP != 4);
+	BUILD_BUG_ON(NSIGTRAP != 5);
 	BUILD_BUG_ON(NSIGCHLD != 6);
 	BUILD_BUG_ON(NSIGSYS  != 1);
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ff99e2b6fc54..c2f7d1d2a5c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,13 +77,9 @@
 #include <asm/i8259.h>
 #include <asm/misc.h>
 #include <asm/qspinlock.h>
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -242,6 +238,8 @@ static void notrace start_secondary(void *unused)
 	 */
 	check_tsc_sync_target();
 
+	speculative_store_bypass_ht_init();
+
 	/*
 	 * Lock vector_lock, set CPU online and bring the vector
 	 * allocator online. Online must be set with vector_lock held
@@ -390,15 +388,47 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+/*
+ * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ *
+ * These are Intel CPUs that enumerate an LLC that is shared by
+ * multiple NUMA nodes. The LLC on these systems is shared for
+ * off-package data access but private to the NUMA node (half
+ * of the package) for on-package access.
+ *
+ * CPUID (the source of the information about the LLC) can only
+ * enumerate the cache as being shared *or* unshared, but not
+ * this particular configuration. The CPU in this case enumerates
+ * the cache to be shared across the entire package (spanning both
+ * NUMA nodes).
+ */
+
+static const struct x86_cpu_id snc_cpu[] = {
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+	{}
+};
+
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
-	if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
-	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
-		return topology_sane(c, o, "llc");
+	/* Do not match if we do not have a valid APICID for cpu: */
+	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+		return false;
 
-	return false;
+	/* Do not match if LLC id does not match: */
+	if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+		return false;
+
+	/*
+	 * Allow the SNC topology without warning. Return of false
+	 * means 'c' does not share the LLC of 'o'. This will be
+	 * reflected to userspace.
+	 */
+	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+		return false;
+
+	return topology_sane(c, o, "llc");
 }
 
 /*
@@ -456,7 +486,8 @@ static struct sched_domain_topology_level x86_topology[] = {
 
 /*
  * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours and Intel Cluster-on-Die have this.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
  */
 static bool x86_has_numa_in_package;
 
@@ -1257,6 +1288,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	set_mtrr_aps_delayed_init();
 
 	smp_quirk_init_udelay();
+
+	speculative_store_bypass_ht_init();
 }
 
 void arch_enable_nonboot_cpus_begin(void)
@@ -1536,6 +1569,8 @@ static inline void mwait_play_dead(void)
 	void *mwait_ptr;
 	int i;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return;
 	if (!this_cpu_has(X86_FEATURE_MWAIT))
 		return;
 	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index a3f15ed545b5..6a78d4b36a79 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
@@ -19,7 +20,6 @@
 #include <linux/elf.h>
 
 #include <asm/elf.h>
-#include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
 #include <asm/mpx.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 03f3d7695dac..a535dd64de63 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		cond_local_irq_enable(regs);
+		clear_siginfo(&info);
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
@@ -854,6 +855,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 	task->thread.trap_nr	= trapnr;
 	task->thread.error_code = error_code;
+	clear_siginfo(&info);
 	info.si_signo		= SIGFPE;
 	info.si_errno		= 0;
 	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs);
@@ -929,6 +931,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
 
+	clear_siginfo(&info);
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code = ILL_BADSTK;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ef32297ff17e..74392d9d51e0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -317,7 +317,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
 	hpet2 -= hpet1;
 	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
 	do_div(tmp, 1000000);
-	do_div(deltatsc, tmp);
+	deltatsc = div64_u64(deltatsc, tmp);
 
 	return (unsigned long) deltatsc;
 }
@@ -1067,6 +1067,7 @@ static struct clocksource clocksource_tsc_early = {
 	.resume			= tsc_resume,
 	.mark_unstable		= tsc_cs_mark_unstable,
 	.tick_stable		= tsc_cs_tick_stable,
+	.list			= LIST_HEAD_INIT(clocksource_tsc_early.list),
 };
 
 /*
@@ -1086,6 +1087,7 @@ static struct clocksource clocksource_tsc = {
 	.resume			= tsc_resume,
 	.mark_unstable		= tsc_cs_mark_unstable,
 	.tick_stable		= tsc_cs_tick_stable,
+	.list			= LIST_HEAD_INIT(clocksource_tsc.list),
 };
 
 void mark_tsc_unstable(char *reason)
@@ -1098,13 +1100,9 @@ void mark_tsc_unstable(char *reason)
 		clear_sched_clock_stable();
 	disable_sched_clock_irqtime();
 	pr_info("Marking TSC unstable due to %s\n", reason);
-	/* Change only the rating, when not registered */
-	if (clocksource_tsc.mult) {
-		clocksource_mark_unstable(&clocksource_tsc);
-	} else {
-		clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
-		clocksource_tsc.rating = 0;
-	}
+
+	clocksource_mark_unstable(&clocksource_tsc_early);
+	clocksource_mark_unstable(&clocksource_tsc);
 }
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
@@ -1244,7 +1242,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 
 	/* Don't bother refining TSC on unstable systems */
 	if (tsc_unstable)
-		return;
+		goto unreg;
 
 	/*
 	 * Since the work is started early in boot, we may be
@@ -1297,11 +1295,12 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 
 out:
 	if (tsc_unstable)
-		return;
+		goto unreg;
 
 	if (boot_cpu_has(X86_FEATURE_ART))
 		art_related_clocksource = &clocksource_tsc;
 	clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
 	clocksource_unregister(&clocksource_tsc_early);
 }
 
@@ -1311,8 +1310,8 @@ static int __init init_tsc_clocksource(void)
 	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
 		return 0;
 
-	if (check_tsc_unstable())
-		return 0;
+	if (tsc_unstable)
+		goto unreg;
 
 	if (tsc_clocksource_reliable)
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
@@ -1328,6 +1327,7 @@ static int __init init_tsc_clocksource(void)
 		if (boot_cpu_has(X86_FEATURE_ART))
 			art_related_clocksource = &clocksource_tsc;
 		clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
 		clocksource_unregister(&clocksource_tsc_early);
 		return 0;
 	}
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index f44ce0fb3583..ff20b35e98dd 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
 	tsk->thread.error_code	= X86_PF_USER | X86_PF_WRITE;
 	tsk->thread.trap_nr	= X86_TRAP_PF;
 
+	clear_siginfo(&info);
 	info.si_signo	= SIGSEGV;
 	info.si_errno	= 0;
 	info.si_code	= SEGV_MAPERR;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 85c7ef23d99f..58d8d800875d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -299,6 +299,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 	if (is_prefix_bad(insn))
 		return -ENOTSUPP;
 
+	/* We should not singlestep on the exception masking instructions */
+	if (insn_masking_exception(insn))
+		return -ENOTSUPP;
+
 	if (x86_64)
 		good_insns = good_insns_64;
 	else
@@ -1079,8 +1083,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
 		return orig_ret_vaddr;
 
 	if (nleft != rasize) {
-		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
-			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+		pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+		       current->pid, regs->sp, regs->ip);
 
 		force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
 	}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 795f3a80e576..5e1458f609a1 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -117,11 +117,11 @@ SECTIONS
 
 #ifdef CONFIG_X86_64
 		. = ALIGN(PAGE_SIZE);
-		VMLINUX_SYMBOL(__entry_trampoline_start) = .;
+		__entry_trampoline_start = .;
 		_entry_trampoline = .;
 		*(.entry_trampoline)
 		. = ALIGN(PAGE_SIZE);
-		VMLINUX_SYMBOL(__entry_trampoline_end) = .;
+		__entry_trampoline_end = .;
 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
 #endif
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b90a8b3..92bf2f2e7cdd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 0x80000008.ebx */
 	const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-		F(IBPB) | F(IBRS);
+		F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
 
 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -408,7 +408,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
 		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
-		F(ARCH_CAPABILITIES);
+		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 				entry->ecx &= ~F(PKU);
 			entry->edx &= kvm_cpuid_7_0_edx_x86_features;
 			cpuid_mask(&entry->edx, CPUID_7_EDX);
+			/*
+			 * We emulate ARCH_CAPABILITIES in software even
+			 * if the host doesn't support it.
+			 */
+			entry->edx |= F(ARCH_CAPABILITIES);
 		} else {
 			entry->ebx = 0;
 			entry->ecx = 0;
@@ -647,13 +652,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			g_phys_as = phys_as;
 		entry->eax = g_phys_as | (virt_as << 8);
 		entry->edx = 0;
-		/* IBRS and IBPB aren't necessarily present in hardware cpuid */
-		if (boot_cpu_has(X86_FEATURE_IBPB))
-			entry->ebx |= F(IBPB);
-		if (boot_cpu_has(X86_FEATURE_IBRS))
-			entry->ebx |= F(IBRS);
+		/*
+		 * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
+		 * hardware cpuid
+		 */
+		if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+			entry->ebx |= F(AMD_IBPB);
+		if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+			entry->ebx |= F(AMD_IBRS);
+		if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+			entry->ebx |= F(VIRT_SSBD);
 		entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
 		cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+		if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+			entry->ebx |= F(VIRT_SSBD);
 		break;
 	}
 	case 0x80000019:
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 98618e397342..46ff64da44ca 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1260,12 +1260,16 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
 	}
 }
 
-static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
 {
-	struct kvm_run *run = vcpu->run;
+	kvm_hv_hypercall_set_result(vcpu, result);
+	++vcpu->stat.hypercalls;
+	return kvm_skip_emulated_instruction(vcpu);
+}
 
-	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
-	return 1;
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
 }
 
 static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
@@ -1296,8 +1300,10 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
 	if (param & ~KVM_HYPERV_CONN_ID_MASK)
 		return HV_STATUS_INVALID_HYPERCALL_INPUT;
 
-	/* conn_to_evt is protected by vcpu->kvm->srcu */
+	/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+	rcu_read_lock();
 	eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+	rcu_read_unlock();
 	if (!eventfd)
 		return HV_STATUS_INVALID_PORT_ID;
 
@@ -1348,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	/* Hypercall continuation is not supported yet */
 	if (rep_cnt || rep_idx) {
 		ret = HV_STATUS_INVALID_HYPERCALL_CODE;
-		goto set_result;
+		goto out;
 	}
 
 	switch (code) {
@@ -1379,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 		break;
 	}
 
-set_result:
-	kvm_hv_hypercall_set_result(vcpu, ret);
-	return 1;
+out:
+	return kvm_hv_hypercall_complete(vcpu, ret);
 }
 
 void kvm_hv_init_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 70dcb5548022..3773c4625114 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1463,23 +1463,6 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
-static void start_sw_period(struct kvm_lapic *apic)
-{
-	if (!apic->lapic_timer.period)
-		return;
-
-	if (apic_lvtt_oneshot(apic) &&
-	    ktime_after(ktime_get(),
-			apic->lapic_timer.target_expiration)) {
-		apic_timer_expired(apic);
-		return;
-	}
-
-	hrtimer_start(&apic->lapic_timer.timer,
-		apic->lapic_timer.target_expiration,
-		HRTIMER_MODE_ABS_PINNED);
-}
-
 static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
 {
 	ktime_t now, remaining;
@@ -1539,11 +1522,43 @@ static bool set_target_expiration(struct kvm_lapic *apic)
 
 static void advance_periodic_target_expiration(struct kvm_lapic *apic)
 {
-	apic->lapic_timer.tscdeadline +=
-		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+	ktime_t now = ktime_get();
+	u64 tscl = rdtsc();
+	ktime_t delta;
+
+	/*
+	 * Synchronize both deadlines to the same time source or
+	 * differences in the periods (caused by differences in the
+	 * underlying clocks or numerical approximation errors) will
+	 * cause the two to drift apart over time as the errors
+	 * accumulate.
+	 */
 	apic->lapic_timer.target_expiration =
 		ktime_add_ns(apic->lapic_timer.target_expiration,
 				apic->lapic_timer.period);
+	delta = ktime_sub(apic->lapic_timer.target_expiration, now);
+	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+		nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+	if (!apic->lapic_timer.period)
+		return;
+
+	if (ktime_after(ktime_get(),
+			apic->lapic_timer.target_expiration)) {
+		apic_timer_expired(apic);
+
+		if (apic_lvtt_oneshot(apic))
+			return;
+
+		advance_periodic_target_expiration(apic);
+	}
+
+	hrtimer_start(&apic->lapic_timer.timer,
+		apic->lapic_timer.target_expiration,
+		HRTIMER_MODE_ABS_PINNED);
 }
 
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8494dbae41b9..d634f0332c0f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3007,6 +3007,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 {
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo	= SIGBUS;
 	info.si_errno	= 0;
 	info.si_code	= BUS_MCEERR_AR;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b58787daf9f8..26110c202b19 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -49,7 +49,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -213,6 +213,12 @@ struct vcpu_svm {
 	} host;
 
 	u64 spec_ctrl;
+	/*
+	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+	 * translated into the appropriate L2_CFG bits on the host to
+	 * perform speculative control.
+	 */
+	u64 virt_spec_ctrl;
 
 	u32 *msrpm;
 
@@ -1423,12 +1429,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (is_guest_mode(vcpu))
+		return svm->nested.hsave->control.tsc_offset;
+
+	return vcpu->arch.tsc_offset;
+}
+
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 g_tsc_offset = 0;
 
 	if (is_guest_mode(vcpu)) {
+		/* Write L1's TSC offset.  */
 		g_tsc_offset = svm->vmcb->control.tsc_offset -
 			       svm->nested.hsave->control.tsc_offset;
 		svm->nested.hsave->control.tsc_offset = offset;
@@ -2049,6 +2066,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.microcode_version = 0x01000065;
 	svm->spec_ctrl = 0;
+	svm->virt_spec_ctrl = 0;
 
 	if (!init_event) {
 		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
@@ -3322,6 +3340,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Restore the original control entries */
 	copy_vmcb_control_area(vmcb, hsave);
 
+	svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -3482,10 +3501,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	/* We don't want to see VMMCALLs from a nested guest */
 	clr_intercept(svm, INTERCEPT_VMMCALL);
 
+	svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
 	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -4035,12 +4056,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	switch (msr_info->index) {
-	case MSR_IA32_TSC: {
-		msr_info->data = svm->vmcb->control.tsc_offset +
-			kvm_scale_tsc(vcpu, rdtsc());
-
-		break;
-	}
 	case MSR_STAR:
 		msr_info->data = svm->vmcb->save.star;
 		break;
@@ -4100,11 +4115,18 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
 			return 1;
 
 		msr_info->data = svm->spec_ctrl;
 		break;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+			return 1;
+
+		msr_info->data = svm->virt_spec_ctrl;
+		break;
 	case MSR_F15H_IC_CFG: {
 
 		int family, model;
@@ -4193,12 +4215,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		svm->vmcb->save.g_pat = data;
 		mark_dirty(svm->vmcb, VMCB_NPT);
 		break;
-	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, msr);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
@@ -4225,7 +4244,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		break;
 	case MSR_IA32_PRED_CMD:
 		if (!msr->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
+		    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
 			return 1;
 
 		if (data & ~PRED_CMD_IBPB)
@@ -4239,6 +4258,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 			break;
 		set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
 		break;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		if (!msr->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+			return 1;
+
+		if (data & ~SPEC_CTRL_SSBD)
+			return 1;
+
+		svm->virt_spec_ctrl = data;
+		break;
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
 		break;
@@ -5265,9 +5294,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		}
 
 		if (!ret && svm) {
-			trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
-						 host_irq, e->gsi,
-						 vcpu_info.vector,
+			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+						 e->gsi, vcpu_info.vector,
 						 vcpu_info.pi_desc_addr, set);
 		}
 
@@ -5553,8 +5581,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
@@ -5648,6 +5675,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
+#ifdef CONFIG_X86_64
+	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+	loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+	loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
 	/*
 	 * We do not use IBRS in the kernel. If this vCPU has used the
 	 * SPEC_CTRL MSR it may have left it on; save the value and
@@ -5666,20 +5705,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (svm->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
-
-	/* Eliminate branch target predictions from guest mode */
-	vmexit_fill_RSB();
-
-#ifdef CONFIG_X86_64
-	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
-	loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
-	loadsegment(gs, svm->host.gs);
-#endif
-#endif
+	x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
 	reload_tss(vcpu);
 
@@ -5782,7 +5808,7 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static bool svm_has_high_real_mode_segbase(void)
+static bool svm_has_emulated_msr(int index)
 {
 	return true;
 }
@@ -7008,7 +7034,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
-	.cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
+	.has_emulated_msr = svm_has_emulated_msr,
 
 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,
@@ -7102,6 +7128,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
+	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
 
 	.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aafcc9881e88..40aa29204baf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,7 +51,7 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
 #include <asm/mshyperv.h>
 
 #include "trace.h"
@@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void)
 		SECONDARY_EXEC_ENABLE_VMFUNC;
 }
 
+static bool vmx_umip_emulated(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_DESC;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -2880,18 +2886,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 {
-	u64 host_tsc, tsc_offset;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	host_tsc = rdtsc();
-	tsc_offset = vmcs_read64(TSC_OFFSET);
-	return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+	if (is_guest_mode(vcpu) &&
+	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+	return vcpu->arch.tsc_offset;
 }
 
 /*
@@ -3524,12 +3527,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 #endif
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_info);
-	case MSR_IA32_TSC:
-		msr_info->data = guest_read_tsc(vcpu);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
@@ -3646,17 +3645,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
-	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, msr_info);
-		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
 		/* The STIBP bit doesn't fault even if it's not advertised */
-		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
 			return 1;
 
 		vmx->spec_ctrl = data;
@@ -3682,7 +3677,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_PRED_CMD:
 		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 			return 1;
 
@@ -4553,12 +4547,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 }
 
-static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
-{
-	if (enable_ept)
-		vmx_flush_tlb(vcpu, true);
-}
-
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -4776,14 +4764,16 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	else
 		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
 
-	if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
-		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-			      SECONDARY_EXEC_DESC);
-		hw_cr4 &= ~X86_CR4_UMIP;
-	} else if (!is_guest_mode(vcpu) ||
-	           !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
-		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+		if (cr4 & X86_CR4_UMIP) {
+			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 				SECONDARY_EXEC_DESC);
+			hw_cr4 &= ~X86_CR4_UMIP;
+		} else if (!is_guest_mode(vcpu) ||
+			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+					SECONDARY_EXEC_DESC);
+	}
 
 	if (cr4 & X86_CR4_VMXE) {
 		/*
@@ -9287,7 +9277,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	} else {
 		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-		vmx_flush_tlb_ept_only(vcpu);
+		vmx_flush_tlb(vcpu, true);
 	}
 	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -9315,7 +9305,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 	    !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
 			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
 		vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		vmx_flush_tlb_ept_only(vcpu);
+		vmx_flush_tlb(vcpu, true);
 	}
 }
 
@@ -9495,9 +9485,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 
-static bool vmx_has_high_real_mode_segbase(void)
+static bool vmx_has_emulated_msr(int index)
 {
-	return enable_unrestricted_guest || emulate_invalid_guest_state;
+	switch (index) {
+	case MSR_IA32_SMBASE:
+		/*
+		 * We cannot do SMM unless we can run the guest in big
+		 * real mode.
+		 */
+		return enable_unrestricted_guest || emulate_invalid_guest_state;
+	case MSR_AMD64_VIRT_SPEC_CTRL:
+		/* This is AMD only.  */
+		return false;
+	default:
+		return true;
+	}
 }
 
 static bool vmx_mpx_supported(void)
@@ -9512,12 +9514,6 @@ static bool vmx_xsaves_supported(void)
 		SECONDARY_EXEC_XSAVES;
 }
 
-static bool vmx_umip_emulated(void)
-{
-	return vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_DESC;
-}
-
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -9735,8 +9731,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 
@@ -9884,8 +9879,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-	if (vmx->spec_ctrl)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
@@ -10608,6 +10602,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
+		return -EINVAL;
+	else
+		return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 					   struct vmcs12 *vmcs12)
 {
@@ -11176,11 +11180,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 	}
 
-	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-		vmcs_write64(TSC_OFFSET,
-			vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-	else
-		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
 
@@ -11222,7 +11223,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		}
 	} else if (nested_cpu_has2(vmcs12,
 				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		vmx_flush_tlb_ept_only(vcpu);
+		vmx_flush_tlb(vcpu, true);
 	}
 
 	/*
@@ -11299,6 +11300,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -11420,6 +11424,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 msr_entry_idx;
 	u32 exit_qual;
+	int r;
 
 	enter_guest_mode(vcpu);
 
@@ -11429,26 +11434,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_segment_cache_clear(vmx);
 
-	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-		leave_guest_mode(vcpu);
-		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-		nested_vmx_entry_failure(vcpu, vmcs12,
-					 EXIT_REASON_INVALID_STATE, exit_qual);
-		return 1;
-	}
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+	r = EXIT_REASON_INVALID_STATE;
+	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+		goto fail;
 
 	nested_get_vmcs12_pages(vcpu, vmcs12);
 
+	r = EXIT_REASON_MSR_LOAD_FAIL;
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
 					    vmcs12->vm_entry_msr_load_count);
-	if (msr_entry_idx) {
-		leave_guest_mode(vcpu);
-		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-		nested_vmx_entry_failure(vcpu, vmcs12,
-				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-		return 1;
-	}
+	if (msr_entry_idx)
+		goto fail;
 
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11457,6 +11457,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
 	 */
 	return 0;
+
+fail:
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+	leave_guest_mode(vcpu);
+	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+	nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+	return 1;
 }
 
 /*
@@ -12028,6 +12036,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	leave_guest_mode(vcpu);
 
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
 	if (likely(!vmx->fail)) {
 		if (exit_reason == -1)
 			sync_vmcs12(vcpu, vmcs12);
@@ -12065,7 +12076,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	} else if (!nested_cpu_has_ept(vmcs12) &&
 		   nested_cpu_has2(vmcs12,
 				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		vmx_flush_tlb_ept_only(vcpu);
+		vmx_flush_tlb(vcpu, true);
 	}
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -12224,10 +12235,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tscl = rdtsc();
-	u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-	u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+	struct vcpu_vmx *vmx;
+	u64 tscl, guest_tscl, delta_tsc;
+
+	if (kvm_mwait_in_guest(vcpu->kvm))
+		return -EOPNOTSUPP;
+
+	vmx = to_vmx(vcpu);
+	tscl = rdtsc();
+	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
 	/* Convert to host delta tsc if tsc scaling is enabled */
 	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@ -12533,7 +12550,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
 		vcpu_info.vector = irq.vector;
 
-		trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
 				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
 		if (set)
@@ -12622,7 +12639,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = report_flexpriority,
-	.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_init = vmx_vm_init,
 	.vm_alloc = vmx_vm_alloc,
@@ -12712,6 +12729,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 
 	.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2ff74b12ec4..71e7cda6d014 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 static bool __read_mostly report_ignored_msrs = true;
 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
 
-unsigned int min_timer_period_us = 500;
+unsigned int min_timer_period_us = 200;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 
 static bool __read_mostly kvmclock_periodic_sync = true;
@@ -843,7 +843,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 #ifdef CONFIG_X86_64
-	cr3 &= ~CR3_PCID_INVD;
+	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+	if (pcid_enabled)
+		cr3 &= ~CR3_PCID_INVD;
 #endif
 
 	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
@@ -1058,6 +1061,7 @@ static u32 emulated_msrs[] = {
 	MSR_SMI_COUNT,
 	MSR_PLATFORM_INFO,
 	MSR_MISC_FEATURES_ENABLES,
+	MSR_AMD64_VIRT_SPEC_CTRL,
 };
 
 static unsigned num_emulated_msrs;
@@ -1490,7 +1494,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = vcpu->arch.tsc_offset;
+	u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1532,7 +1536,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+	return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
@@ -2362,6 +2368,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.smbase = data;
 		break;
+	case MSR_IA32_TSC:
+		kvm_write_tsc(vcpu, msr_info);
+		break;
 	case MSR_SMI_COUNT:
 		if (!msr_info->host_initiated)
 			return 1;
@@ -2605,6 +2614,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_UCODE_REV:
 		msr_info->data = vcpu->arch.microcode_version;
 		break;
+	case MSR_IA32_TSC:
+		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+		break;
 	case MSR_MTRRcap:
 	case 0x200 ... 0x2ff:
 		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2819,7 +2831,8 @@ out:
 static inline bool kvm_can_mwait_in_guest(void)
 {
 	return boot_cpu_has(X86_FEATURE_MWAIT) &&
-		!boot_cpu_has_bug(X86_BUG_MONITOR);
+		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
+		boot_cpu_has(X86_FEATURE_ARAT);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -2894,7 +2907,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		 * fringe case that is not enabled except via specific settings
 		 * of the module parameters.
 		 */
-		r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+		r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
 		break;
 	case KVM_CAP_VAPIC:
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
@@ -4594,14 +4607,8 @@ static void kvm_init_msr_list(void)
 	num_msrs_to_save = j;
 
 	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
-		switch (emulated_msrs[i]) {
-		case MSR_IA32_SMBASE:
-			if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
-				continue;
-			break;
-		default:
-			break;
-		}
+		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+			continue;
 
 		if (j < i)
 			emulated_msrs[j] = emulated_msrs[i];
@@ -6662,9 +6669,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
-	int op_64_bit, r;
-
-	r = kvm_skip_emulated_instruction(vcpu);
+	int op_64_bit;
 
 	if (kvm_hv_hypercall_enabled(vcpu->kvm))
 		return kvm_hv_hypercall(vcpu);
@@ -6712,8 +6717,9 @@ out:
 	if (!op_64_bit)
 		ret = (u32)ret;
 	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+
 	++vcpu->stat.hypercalls;
-	return r;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
@@ -7970,6 +7976,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct msr_data apic_base_msr;
 	int mmu_reset_needed = 0;
+	int cpuid_update_needed = 0;
 	int pending_vec, max_bits, idx;
 	struct desc_ptr dt;
 	int ret = -EINVAL;
@@ -8008,8 +8015,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	vcpu->arch.cr0 = sregs->cr0;
 
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+	cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
+				(X86_CR4_OSXSAVE | X86_CR4_PKE));
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-	if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+	if (cpuid_update_needed)
 		kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7d35ce672989..c9492f764902 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -302,13 +302,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 	    __rem;						\
 	 })
 
-#define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
-#define KVM_X86_DISABLE_EXITS_HTL            (1 << 1)
-#define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
-#define KVM_X86_DISABLE_VALID_EXITS          (KVM_X86_DISABLE_EXITS_MWAIT | \
-                                              KVM_X86_DISABLE_EXITS_HTL | \
-                                              KVM_X86_DISABLE_EXITS_PAUSE)
-
 static inline bool kvm_mwait_in_guest(struct kvm *kvm)
 {
 	return kvm->arch.mwait_in_guest;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 9a53a06e5a3e..c3b527a9f95d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
+ * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe_unrolled)
+ENTRY(__memcpy_mcsafe)
 	cmpl $8, %edx
 	/* Less than 8 bytes? Go to byte copy loop */
 	jb .L_no_whole_words
@@ -204,58 +204,29 @@ ENTRY(memcpy_mcsafe_unrolled)
 	subl $8, %ecx
 	negl %ecx
 	subl %ecx, %edx
-.L_copy_leading_bytes:
+.L_read_leading_bytes:
 	movb (%rsi), %al
+.L_write_leading_bytes:
 	movb %al, (%rdi)
 	incq %rsi
 	incq %rdi
 	decl %ecx
-	jnz .L_copy_leading_bytes
+	jnz .L_read_leading_bytes
 
 .L_8byte_aligned:
-	/* Figure out how many whole cache lines (64-bytes) to copy */
-	movl %edx, %ecx
-	andl $63, %edx
-	shrl $6, %ecx
-	jz .L_no_whole_cache_lines
-
-	/* Loop copying whole cache lines */
-.L_cache_w0: movq (%rsi), %r8
-.L_cache_w1: movq 1*8(%rsi), %r9
-.L_cache_w2: movq 2*8(%rsi), %r10
-.L_cache_w3: movq 3*8(%rsi), %r11
-	movq %r8, (%rdi)
-	movq %r9, 1*8(%rdi)
-	movq %r10, 2*8(%rdi)
-	movq %r11, 3*8(%rdi)
-.L_cache_w4: movq 4*8(%rsi), %r8
-.L_cache_w5: movq 5*8(%rsi), %r9
-.L_cache_w6: movq 6*8(%rsi), %r10
-.L_cache_w7: movq 7*8(%rsi), %r11
-	movq %r8, 4*8(%rdi)
-	movq %r9, 5*8(%rdi)
-	movq %r10, 6*8(%rdi)
-	movq %r11, 7*8(%rdi)
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-	decl %ecx
-	jnz .L_cache_w0
-
-	/* Are there any trailing 8-byte words? */
-.L_no_whole_cache_lines:
 	movl %edx, %ecx
 	andl $7, %edx
 	shrl $3, %ecx
 	jz .L_no_whole_words
 
-	/* Copy trailing words */
-.L_copy_trailing_words:
+.L_read_words:
 	movq (%rsi), %r8
-	mov %r8, (%rdi)
-	leaq 8(%rsi), %rsi
-	leaq 8(%rdi), %rdi
+.L_write_words:
+	movq %r8, (%rdi)
+	addq $8, %rsi
+	addq $8, %rdi
 	decl %ecx
-	jnz .L_copy_trailing_words
+	jnz .L_read_words
 
 	/* Any trailing bytes? */
 .L_no_whole_words:
@@ -264,38 +235,53 @@ ENTRY(memcpy_mcsafe_unrolled)
 
 	/* Copy trailing bytes */
 	movl %edx, %ecx
-.L_copy_trailing_bytes:
+.L_read_trailing_bytes:
 	movb (%rsi), %al
+.L_write_trailing_bytes:
 	movb %al, (%rdi)
 	incq %rsi
 	incq %rdi
 	decl %ecx
-	jnz .L_copy_trailing_bytes
+	jnz .L_read_trailing_bytes
 
 	/* Copy successful. Return zero */
 .L_done_memcpy_trap:
 	xorq %rax, %rax
 	ret
-ENDPROC(memcpy_mcsafe_unrolled)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
 	.section .fixup, "ax"
-	/* Return -EFAULT for any failure */
-.L_memcpy_mcsafe_fail:
-	mov	$-EFAULT, %rax
+	/*
+	 * Return number of bytes not copied for any failure. Note that
+	 * there is no "tail" handling since the source buffer is 8-byte
+	 * aligned and poison is cacheline aligned.
+	 */
+.E_read_words:
+	shll	$3, %ecx
+.E_leading_bytes:
+	addl	%edx, %ecx
+.E_trailing_bytes:
+	mov	%ecx, %eax
 	ret
 
+	/*
+	 * For write fault handling, given the destination is unaligned,
+	 * we handle faults on multi-byte writes with a byte-by-byte
+	 * copy up to the write-protected page.
+	 */
+.E_write_words:
+	shll	$3, %ecx
+	addl	%edx, %ecx
+	movl	%ecx, %edx
+	jmp mcsafe_handle_tail
+
 	.previous
 
-	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
-	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+	_ASM_EXTABLE(.L_write_words, .E_write_words)
+	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..9c5606d88f61 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -23,13 +23,13 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
-		"0:	movq %[zero],(%[dst])\n"
-		"	addq   %[eight],%[dst]\n"
+		"0:	movq $0,(%[dst])\n"
+		"	addq   $8,%[dst]\n"
 		"	decl %%ecx ; jnz   0b\n"
 		"4:	movq  %[size1],%%rcx\n"
 		"	testl %%ecx,%%ecx\n"
 		"	jz     2f\n"
-		"1:	movb   %b[zero],(%[dst])\n"
+		"1:	movb   $0,(%[dst])\n"
 		"	incq   %[dst]\n"
 		"	decl %%ecx ; jnz  1b\n"
 		"2:\n"
@@ -40,8 +40,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		_ASM_EXTABLE(0b,3b)
 		_ASM_EXTABLE(1b,2b)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
-		  [zero] "r" (0UL), [eight] "r" (8UL));
+		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
 	clac();
 	return size;
 }
@@ -75,6 +74,27 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
 	return len;
 }
 
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+	for (; len; --len, to++, from++) {
+		/*
+		 * Call the assembly routine back directly since
+		 * memcpy_mcsafe() may silently fallback to memcpy.
+		 */
+		unsigned long rem = __memcpy_mcsafe(to, from, 1);
+
+		if (rem)
+			break;
+	}
+	return len;
+}
+
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 62a7e9f65dec..2f3c9196b834 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
+#include <linux/highmem.h>
 
 #include <asm/pgtable.h>
 
@@ -334,16 +335,16 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 			   pgprotval_t eff_in, unsigned long P)
 {
 	int i;
-	pte_t *start;
+	pte_t *pte;
 	pgprotval_t prot, eff;
 
-	start = (pte_t *)pmd_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PTE; i++) {
-		prot = pte_flags(*start);
-		eff = effective_prot(eff_in, prot);
 		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+		pte = pte_offset_map(&addr, st->current_address);
+		prot = pte_flags(*pte);
+		eff = effective_prot(eff_in, prot);
 		note_page(m, st, __pgprot(prot), eff, 5);
-		start++;
+		pte_unmap(pte);
 	}
 }
 #ifdef CONFIG_KASAN
@@ -359,7 +360,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
 				void *pt)
 {
 	if (__pa(pt) == __pa(kasan_zero_pmd) ||
-	    (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
+	    (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
 	    __pa(pt) == __pa(kasan_zero_pud)) {
 		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
 		note_page(m, st, __pgprot(prot), 0, 5);
@@ -475,8 +476,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	}
 }
 
-#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
-#define pgd_none(a)  (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a)  (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
 
 static inline bool is_hypervisor_range(int idx)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 73bd8c95ac71..9a84a0d08727 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -209,6 +209,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 	unsigned lsb = 0;
 	siginfo_t info;
 
+	clear_siginfo(&info);
 	info.si_signo	= si_signo;
 	info.si_errno	= 0;
 	info.si_code	= si_code;
@@ -439,7 +440,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	if (pgd_none(*pgd_k))
 		return -1;
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		if (pgd_none(*pgd)) {
 			set_pgd(pgd, *pgd_k);
 			arch_flush_lazy_mmu_mode();
@@ -454,7 +455,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	if (p4d_none(*p4d_k))
 		return -1;
 
-	if (p4d_none(*p4d) && !pgtable_l5_enabled) {
+	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
 		set_p4d(p4d, *p4d_k);
 		arch_flush_lazy_mmu_mode();
 	} else {
@@ -828,6 +829,8 @@ static inline void
 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 		unsigned long address, struct task_struct *tsk)
 {
+	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
 	if (!unhandled_signal(tsk, SIGSEGV))
 		return;
 
@@ -835,13 +838,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 		return;
 
 	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
-		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-		tsk->comm, task_pid_nr(tsk), address,
+		loglvl, tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);
 
 	printk(KERN_CONT "\n");
+
+	show_opcodes((u8 *)regs->ip, loglvl);
 }
 
 static void
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index a2f0c7e20fb0..fe7a12599d8e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -123,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 		result = ident_p4d_init(info, p4d, addr, next);
 		if (result)
 			return result;
-		if (pgtable_l5_enabled) {
+		if (pgtable_l5_enabled()) {
 			set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
 		} else {
 			/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a400606dea0..17383f9677fa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -180,7 +180,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
  */
 void sync_global_pgds(unsigned long start, unsigned long end)
 {
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		sync_global_pgds_l5(start, end);
 	else
 		sync_global_pgds_l4(start, end);
@@ -643,7 +643,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
 	unsigned long vaddr = (unsigned long)__va(paddr);
 	int i = p4d_index(vaddr);
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
 
 	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -723,7 +723,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
 					   page_size_mask);
 
 		spin_lock(&init_mm.page_table_lock);
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			pgd_populate(&init_mm, pgd, p4d);
 		else
 			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1100,7 +1100,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
 		 * 5-level case we should free them. This code will have to change
 		 * to adapt for boot-time switching between 4 and 5 level page tables.
 		 */
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			free_pud_table(pud_base, p4d);
 	}
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 980dbebd0ca7..e3e77527f8df 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -2,10 +2,8 @@
 #define DISABLE_BRANCH_PROFILING
 #define pr_fmt(fmt) "kasan: " fmt
 
-#ifdef CONFIG_X86_5LEVEL
-/* Too early to use cpu_feature_enabled() */
-#define pgtable_l5_enabled __pgtable_l5_enabled
-#endif
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
 
 #include <linux/bootmem.h>
 #include <linux/kasan.h>
@@ -182,7 +180,7 @@ static void __init clear_pgds(unsigned long start,
 		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
 		 * instead.
 		 */
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			pgd_clear(pgd);
 		else
 			p4d_clear(p4d_offset(pgd, start));
@@ -197,7 +195,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
 {
 	unsigned long p4d;
 
-	if (!pgtable_l5_enabled)
+	if (!pgtable_l5_enabled())
 		return (p4d_t *)pgd;
 
 	p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -284,7 +282,7 @@ void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PUD; i++)
 		kasan_zero_pud[i] = __pud(pud_val);
 
-	for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
+	for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
 		kasan_zero_p4d[i] = __p4d(p4d_val);
 
 	kasan_map_early_shadow(early_top_pgt);
@@ -315,7 +313,7 @@ void __init kasan_init(void)
 	 * bunch of things like kernel code, modules, EFI mapping, etc.
 	 * We need to take extra steps to not overwrite them.
 	 */
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		void *ptr;
 
 		ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 615cc03ced84..61db77b0eda9 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -78,7 +78,7 @@ void __init kernel_randomize_memory(void)
 	struct rnd_state rand_state;
 	unsigned long remain_entropy;
 
-	vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+	vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
 	vaddr = vaddr_start;
 
 	/*
@@ -124,7 +124,7 @@ void __init kernel_randomize_memory(void)
 		 */
 		entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
 		prandom_bytes_state(&rand_state, &rand, sizeof(rand));
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			entropy = (rand % (entropy + 1)) & P4D_MASK;
 		else
 			entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -136,7 +136,7 @@ void __init kernel_randomize_memory(void)
 		 * randomization alignment.
 		 */
 		vaddr += get_padding(&kaslr_regions[i]);
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			vaddr = round_up(vaddr + 1, P4D_SIZE);
 		else
 			vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -212,7 +212,7 @@ void __meminit init_trampoline(void)
 		return;
 	}
 
-	if (pgtable_l5_enabled)
+	if (pgtable_l5_enabled())
 		init_trampoline_p4d();
 	else
 		init_trampoline_pud();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5aa816..fa150855647c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -136,13 +136,13 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 
 	/* whine about and ignore invalid blks */
 	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
-			   nid, start, end - 1);
+		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			nid, start, end - 1);
 		return 0;
 	}
 
 	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: too many memblk ranges\n");
+		pr_err("too many memblk ranges\n");
 		return -EINVAL;
 	}
 
@@ -267,14 +267,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			 */
 			if (bi->end > bj->start && bi->start < bj->end) {
 				if (bi->nid != bj->nid) {
-					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
 					       bi->nid, bi->start, bi->end - 1,
 					       bj->nid, bj->start, bj->end - 1);
 					return -EINVAL;
 				}
-				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
-					   bi->nid, bi->start, bi->end - 1,
-					   bj->start, bj->end - 1);
+				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					bi->nid, bi->start, bi->end - 1,
+					bj->start, bj->end - 1);
 			}
 
 			/*
@@ -364,7 +364,7 @@ static int __init numa_alloc_distance(void)
 	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
 				      size, PAGE_SIZE);
 	if (!phys) {
-		pr_warning("NUMA: Warning: can't allocate distance table!\n");
+		pr_warn("Warning: can't allocate distance table!\n");
 		/* don't retry until explicitly reset */
 		numa_distance = (void *)1LU;
 		return -ENOMEM;
@@ -410,14 +410,14 @@ void __init numa_set_distance(int from, int to, int distance)
 
 	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
 			from < 0 || to < 0) {
-		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
-			    from, to, distance);
+		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+			     from, to, distance);
 		return;
 	}
 
 	if ((u8)distance != distance ||
 	    (from == to && distance != LOCAL_DISTANCE)) {
-		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
 			     from, to, distance);
 		return;
 	}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0f3d50f4c48c..3bded76e8d5c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -93,6 +93,18 @@ void arch_report_meminfo(struct seq_file *m)
 static inline void split_page_count(int level) { }
 #endif
 
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr <= end;
+}
+
 #ifdef CONFIG_X86_64
 
 static inline unsigned long highmap_start_pfn(void)
@@ -106,20 +118,25 @@ static inline unsigned long highmap_end_pfn(void)
 	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
 }
 
-#endif
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
 {
-	return addr >= start && addr < end;
+	/*
+	 * Kernel text has an alias mapping at a high address, known
+	 * here as "highmap".
+	 */
+	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
 }
 
-static inline int
-within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+#else
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
 {
-	return addr >= start && addr <= end;
+	/* There is no highmap on 32-bit */
+	return false;
 }
 
+#endif
+
 /*
  * Flushing functions
  */
@@ -172,7 +189,7 @@ static void __cpa_flush_all(void *arg)
 
 static void cpa_flush_all(unsigned long cache)
 {
-	BUG_ON(irqs_disabled());
+	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
@@ -236,7 +253,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 #endif
 
-	BUG_ON(irqs_disabled());
+	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
 	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 
@@ -1183,6 +1200,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
 		cpa->numpages = 1;
 		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
 		return 0;
+
+	} else if (__cpa_pfn_in_highmap(cpa->pfn)) {
+		/* Faults in the highmap are OK, so do not warn: */
+		return -EFAULT;
 	} else {
 		WARN(1, KERN_WARNING "CPA: called for zero pte. "
 			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
@@ -1335,8 +1356,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	 * to touch the high mapped kernel as well:
 	 */
 	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
-	    within_inclusive(cpa->pfn, highmap_start_pfn(),
-			     highmap_end_pfn())) {
+	    __cpa_pfn_in_highmap(cpa->pfn)) {
 		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
 					       __START_KERNEL_map - phys_base;
 		alias_cpa = *cpa;
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0eea20a5..6e98e0a7c923 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 	 */
 	if (pkey != -1)
 		return pkey;
-	/*
-	 * Look for a protection-key-drive execute-only mapping
-	 * which is now being given permissions that are not
-	 * execute-only.  Move it back to the default pkey.
-	 */
-	if (vma_is_pkey_exec_only(vma) &&
-	    (prot & (PROT_READ|PROT_WRITE))) {
-		return 0;
-	}
+
 	/*
 	 * The mapping is execute-only.  Go try to get the
 	 * execute-only protection key.  If we fail to do that,
 	 * fall through as if we do not have execute-only
-	 * support.
+	 * support in this mm.
 	 */
 	if (prot == PROT_EXEC) {
 		pkey = execute_only_pkey(vma->vm_mm);
 		if (pkey > 0)
 			return pkey;
+	} else if (vma_is_pkey_exec_only(vma)) {
+		/*
+		 * Protections are *not* PROT_EXEC, but the mapping
+		 * is using the exec-only pkey.  This mapping was
+		 * PROT_EXEC and will no longer be.  Move back to
+		 * the default pkey.
+		 */
+		return ARCH_DEFAULT_PKEY;
 	}
+
 	/*
 	 * This is a vanilla, non-pkey mprotect (or we failed to
 	 * setup execute-only), inherit the pkey from the VMA we
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index f1fd52f449e0..4d418e705878 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -421,6 +421,16 @@ static inline bool pti_kernel_image_global_ok(void)
 	if (boot_cpu_has(X86_FEATURE_K8))
 		return false;
 
+	/*
+	 * RANDSTRUCT derives its hardening benefits from the
+	 * attacker's lack of knowledge about the layout of kernel
+	 * data structures.  Keep the kernel image non-global in
+	 * cases where RANDSTRUCT is in use to help keep the layout a
+	 * secret.
+	 */
+	if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+		return false;
+
 	return true;
 }
 
@@ -430,12 +440,24 @@ static inline bool pti_kernel_image_global_ok(void)
  */
 void pti_clone_kernel_text(void)
 {
+	/*
+	 * rodata is part of the kernel image and is normally
+	 * readable on the filesystem or on the web.  But, do not
+	 * clone the areas past rodata, they might contain secrets.
+	 */
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+	unsigned long end = (unsigned long)__end_rodata_hpage_align;
 
 	if (!pti_kernel_image_global_ok())
 		return;
 
+	pr_debug("mapping partial kernel image into user address space\n");
+
+	/*
+	 * Note that this will undo _some_ of the work that
+	 * pti_set_kernel_image_nonglobal() did to clear the
+	 * global bit.
+	 */
 	pti_clone_pmds(start, end, _PAGE_RW);
 }
 
@@ -458,8 +480,6 @@ void pti_set_kernel_image_nonglobal(void)
 	if (pti_kernel_image_global_ok())
 		return;
 
-	pr_debug("set kernel image non-global\n");
-
 	set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e055d1a06699..6eb1f34c3c85 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	unsigned long sp = current_stack_pointer;
 	pgd_t *pgd = pgd_offset(mm, sp);
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		if (unlikely(pgd_none(*pgd))) {
 			pgd_t *pgd_ref = pgd_offset_k(sp);
 
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b619598..59e123da580c 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,9 @@
 #
 # Arch-specific network modules
 #
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
 
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+ifeq ($(CONFIG_X86_32),y)
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
deleted file mode 100644
index b33093f84528..000000000000
--- a/arch/x86/net/bpf_jit.S
+++ /dev/null
@@ -1,154 +0,0 @@
-/* bpf_jit.S : BPF JIT helper functions
- *
- * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-/*
- * Calling convention :
- * rbx : skb pointer (callee saved)
- * esi : offset of byte(s) to fetch in skb (can be scratched)
- * r10 : copy of skb->data
- * r9d : hlen = skb->len - skb->data_len
- */
-#define SKBDATA	%r10
-#define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
-
-#define FUNC(name) \
-	.globl name; \
-	.type name, @function; \
-	name:
-
-FUNC(sk_load_word)
-	test	%esi,%esi
-	js	bpf_slow_path_word_neg
-
-FUNC(sk_load_word_positive_offset)
-	mov	%r9d,%eax		# hlen
-	sub	%esi,%eax		# hlen - offset
-	cmp	$3,%eax
-	jle	bpf_slow_path_word
-	mov     (SKBDATA,%rsi),%eax
-	bswap   %eax  			/* ntohl() */
-	ret
-
-FUNC(sk_load_half)
-	test	%esi,%esi
-	js	bpf_slow_path_half_neg
-
-FUNC(sk_load_half_positive_offset)
-	mov	%r9d,%eax
-	sub	%esi,%eax		#	hlen - offset
-	cmp	$1,%eax
-	jle	bpf_slow_path_half
-	movzwl	(SKBDATA,%rsi),%eax
-	rol	$8,%ax			# ntohs()
-	ret
-
-FUNC(sk_load_byte)
-	test	%esi,%esi
-	js	bpf_slow_path_byte_neg
-
-FUNC(sk_load_byte_positive_offset)
-	cmp	%esi,%r9d   /* if (offset >= hlen) goto bpf_slow_path_byte */
-	jle	bpf_slow_path_byte
-	movzbl	(SKBDATA,%rsi),%eax
-	ret
-
-/* rsi contains offset and can be scratched */
-#define bpf_slow_path_common(LEN)		\
-	lea	32(%rbp), %rdx;\
-	FRAME_BEGIN;				\
-	mov	%rbx, %rdi; /* arg1 == skb */	\
-	push	%r9;				\
-	push	SKBDATA;			\
-/* rsi already has offset */			\
-	mov	$LEN,%ecx;	/* len */	\
-	call	skb_copy_bits;			\
-	test    %eax,%eax;			\
-	pop	SKBDATA;			\
-	pop	%r9;				\
-	FRAME_END
-
-
-bpf_slow_path_word:
-	bpf_slow_path_common(4)
-	js	bpf_error
-	mov	32(%rbp),%eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half:
-	bpf_slow_path_common(2)
-	js	bpf_error
-	mov	32(%rbp),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte:
-	bpf_slow_path_common(1)
-	js	bpf_error
-	movzbl	32(%rbp),%eax
-	ret
-
-#define sk_negative_common(SIZE)				\
-	FRAME_BEGIN;						\
-	mov	%rbx, %rdi; /* arg1 == skb */			\
-	push	%r9;						\
-	push	SKBDATA;					\
-/* rsi already has offset */					\
-	mov	$SIZE,%edx;	/* size */			\
-	call	bpf_internal_load_pointer_neg_helper;		\
-	test	%rax,%rax;					\
-	pop	SKBDATA;					\
-	pop	%r9;						\
-	FRAME_END;						\
-	jz	bpf_error
-
-bpf_slow_path_word_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi	/* test range */
-	jl	bpf_error	/* offset lower -> error  */
-
-FUNC(sk_load_word_negative_offset)
-	sk_negative_common(4)
-	mov	(%rax), %eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_half_negative_offset)
-	sk_negative_common(2)
-	mov	(%rax),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_byte_negative_offset)
-	sk_negative_common(1)
-	movzbl	(%rax), %eax
-	ret
-
-bpf_error:
-# force a return 0 from jit handler
-	xor	%eax,%eax
-	mov	(%rbp),%rbx
-	mov	8(%rbp),%r13
-	mov	16(%rbp),%r14
-	mov	24(%rbp),%r15
-	add	$40, %rbp
-	leaveq
-	ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b725154182cc..8fca446aaef6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
-/* bpf_jit_comp.c : BPF JIT compiler
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
  *
  * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
  * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -16,15 +17,6 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 
-/*
- * assembly code in arch/x86/net/bpf_jit.S
- */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[];
-extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[];
-
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -45,14 +37,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
 #define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
 #define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
 #define EMIT1_off32(b1, off) \
-	do {EMIT1(b1); EMIT(off, 4); } while (0)
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
 #define EMIT2_off32(b1, b2, off) \
-	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
 #define EMIT3_off32(b1, b2, b3, off) \
-	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
 #define EMIT4_off32(b1, b2, b3, b4, off) \
-	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 static bool is_imm8(int value)
 {
@@ -70,9 +63,10 @@ static bool is_uimm32(u64 value)
 }
 
 /* mov dst, src */
-#define EMIT_mov(DST, SRC) \
-	do {if (DST != SRC) \
-		EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+#define EMIT_mov(DST, SRC)								 \
+	do {										 \
+		if (DST != SRC)								 \
+			EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
 	} while (0)
 
 static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +83,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 		return 0;
 }
 
-/* list of x86 cond jumps opcodes (. + s8)
+/*
+ * List of x86 cond jumps opcodes (. + s8)
  * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
  */
 #define X86_JB  0x72
@@ -103,38 +98,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define X86_JLE 0x7E
 #define X86_JG  0x7F
 
-#define CHOOSE_LOAD_FUNC(K, func) \
-	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
-/* pick a register outside of BPF range for JIT internal work */
+/* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
-/* The following table maps BPF registers to x64 registers.
+/*
+ * The following table maps BPF registers to x86-64 registers.
  *
- * x64 register r12 is unused, since if used as base address
+ * x86-64 register R12 is unused, since if used as base address
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- *  r9 caches skb->len - skb->data_len
- * r10 caches skb->data, and used for blinding (if enabled)
+ * Also x86-64 register R9 is unused. x86-64 register R10 is
+ * used for blinding (if enabled).
  */
 static const int reg2hex[] = {
-	[BPF_REG_0] = 0,  /* rax */
-	[BPF_REG_1] = 7,  /* rdi */
-	[BPF_REG_2] = 6,  /* rsi */
-	[BPF_REG_3] = 2,  /* rdx */
-	[BPF_REG_4] = 1,  /* rcx */
-	[BPF_REG_5] = 0,  /* r8 */
-	[BPF_REG_6] = 3,  /* rbx callee saved */
-	[BPF_REG_7] = 5,  /* r13 callee saved */
-	[BPF_REG_8] = 6,  /* r14 callee saved */
-	[BPF_REG_9] = 7,  /* r15 callee saved */
-	[BPF_REG_FP] = 5, /* rbp readonly */
-	[BPF_REG_AX] = 2, /* r10 temp register */
-	[AUX_REG] = 3,    /* r11 temp register */
+	[BPF_REG_0] = 0,  /* RAX */
+	[BPF_REG_1] = 7,  /* RDI */
+	[BPF_REG_2] = 6,  /* RSI */
+	[BPF_REG_3] = 2,  /* RDX */
+	[BPF_REG_4] = 1,  /* RCX */
+	[BPF_REG_5] = 0,  /* R8  */
+	[BPF_REG_6] = 3,  /* RBX callee saved */
+	[BPF_REG_7] = 5,  /* R13 callee saved */
+	[BPF_REG_8] = 6,  /* R14 callee saved */
+	[BPF_REG_9] = 7,  /* R15 callee saved */
+	[BPF_REG_FP] = 5, /* RBP readonly */
+	[BPF_REG_AX] = 2, /* R10 temp register */
+	[AUX_REG] = 3,    /* R11 temp register */
 };
 
-/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  * which need extra byte of encoding.
  * rax,rcx,...,rbp have simpler encoding
  */
@@ -153,7 +147,7 @@ static bool is_axreg(u32 reg)
 	return reg == BPF_REG_0;
 }
 
-/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
 	if (is_ereg(reg))
@@ -170,13 +164,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
 	return byte;
 }
 
-/* encode 'dst_reg' register into x64 opcode 'byte' */
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
 	return byte + reg2hex[dst_reg];
 }
 
-/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
 static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 {
 	return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
@@ -184,27 +178,24 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 
 static void jit_fill_hole(void *area, unsigned int size)
 {
-	/* fill whole space with int3 instructions */
+	/* Fill whole space with INT3 instructions */
 	memset(area, 0xcc, size);
 }
 
 struct jit_context {
-	int cleanup_addr; /* epilogue code offset */
-	bool seen_ld_abs;
-	bool seen_ax_reg;
+	int cleanup_addr; /* Epilogue code offset */
 };
 
-/* maximum number of bytes emitted while JITing one eBPF insn */
+/* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-#define AUX_STACK_SPACE \
-	(32 /* space for rbx, r13, r14, r15 */ + \
-	 8 /* space for skb_copy_bits() buffer */)
+#define AUX_STACK_SPACE		40 /* Space for RBX, R13, R14, R15, tailcnt */
 
-#define PROLOGUE_SIZE 37
+#define PROLOGUE_SIZE		37
 
-/* emit x64 prologue code for BPF program and check it's size.
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +203,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	EMIT1(0x55); /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+	/* push rbp */
+	EMIT1(0x55);
+
+	/* mov rbp,rsp */
+	EMIT3(0x48, 0x89, 0xE5);
 
 	/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
 	EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,19 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* all classic BPF filters use R6(rbx) save it */
-
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
-
-	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
-	 * R8(r14). R9(r15) spill could be made conditional, but there is only
-	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
-	 * The overhead of extra spill is negligible for any filter other
-	 * than synthetic ones. Therefore not worth adding complexity.
-	 */
-
 	/* mov qword ptr [rbp+8],r13 */
 	EMIT4(0x4C, 0x89, 0x6D, 8);
 	/* mov qword ptr [rbp+16],r14 */
@@ -243,9 +226,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	EMIT4(0x4C, 0x89, 0x7D, 24);
 
 	if (!ebpf_from_cbpf) {
-		/* Clear the tail call counter (tail_call_cnt): for eBPF tail
+		/*
+		 * Clear the tail call counter (tail_call_cnt): for eBPF tail
 		 * calls we need to reset the counter to 0. It's done in two
-		 * instructions, resetting rax register to 0, and moving it
+		 * instructions, resetting RAX register to 0, and moving it
 		 * to the counter location.
 		 */
 
@@ -260,7 +244,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	*pprog = prog;
 }
 
-/* generate the following code:
+/*
+ * Generate the following code:
+ *
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
@@ -278,23 +264,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	int label1, label2, label3;
 	int cnt = 0;
 
-	/* rdi - pointer to ctx
+	/*
+	 * rdi - pointer to ctx
 	 * rsi - pointer to bpf_array
 	 * rdx - index in bpf_array
 	 */
 
-	/* if (index >= array->map.max_entries)
-	 *   goto out;
+	/*
+	 * if (index >= array->map.max_entries)
+	 *	goto out;
 	 */
 	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
 	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
-	 *   goto out;
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, 36);              /* mov eax, dword ptr [rbp + 36] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +297,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT4_off32(0x48, 0x8B, 0x84, 0xD6,       /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
 		    offsetof(struct bpf_array, ptrs));
 
-	/* if (prog == NULL)
-	 *   goto out;
+	/*
+	 * if (prog == NULL)
+	 *	goto out;
 	 */
 	EMIT3(0x48, 0x85, 0xC0);		  /* test rax,rax */
 #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +311,8 @@ static void emit_bpf_tail_call(u8 **pprog)
 	      offsetof(struct bpf_prog, bpf_func));
 	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
 
-	/* now we're ready to jump into next BPF program
+	/*
+	 * Wow we're ready to jump into next BPF program
 	 * rdi == ctx (1st arg)
 	 * rax == prog->bpf_func + prologue_size
 	 */
@@ -334,26 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
-
-static void emit_load_skb_data_hlen(u8 **pprog)
-{
-	u8 *prog = *pprog;
-	int cnt = 0;
-
-	/* r9d = skb->len - skb->data_len (headlen)
-	 * r10 = skb->data
-	 */
-	/* mov %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
-
-	/* sub %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
-
-	/* mov %r10, off32(%rdi) */
-	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
-	*pprog = prog;
-}
-
 static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 			   u32 dst_reg, const u32 imm32)
 {
@@ -361,7 +332,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 	u8 b1, b2, b3;
 	int cnt = 0;
 
-	/* optimization: if imm32 is positive, use 'mov %eax, imm32'
+	/*
+	 * Optimization: if imm32 is positive, use 'mov %eax, imm32'
 	 * (which zero-extends imm32) to save 2 bytes.
 	 */
 	if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +345,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 		goto done;
 	}
 
-	/* optimization: if imm32 is zero, use 'xor %eax, %eax'
+	/*
+	 * Optimization: if imm32 is zero, use 'xor %eax, %eax'
 	 * to save 3 bytes.
 	 */
 	if (imm32 == 0) {
@@ -400,7 +373,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 	int cnt = 0;
 
 	if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
-		/* For emitting plain u32, where sign bit must not be
+		/*
+		 * For emitting plain u32, where sign bit must not be
 		 * propagated LLVM tends to load imm64 over mov32
 		 * directly, so save couple of bytes by just doing
 		 * 'mov %eax, imm32' instead.
@@ -439,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 {
 	struct bpf_insn *insn = bpf_prog->insnsi;
 	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
 	int i, cnt = 0;
@@ -450,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog));
 
-	if (seen_ld_abs)
-		emit_load_skb_data_hlen(&prog);
-
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
 		u32 dst_reg = insn->dst_reg;
@@ -460,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
-		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
-		if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
-			ctx->seen_ax_reg = seen_ax_reg = true;
-
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -525,7 +490,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));
 
-			/* b3 holds 'normal' opcode, b2 short form only valid
+			/*
+			 * b3 holds 'normal' opcode, b2 short form only valid
 			 * in case dst is eax/rax.
 			 */
 			switch (BPF_OP(insn->code)) {
@@ -593,7 +559,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			/* mov rax, dst_reg */
 			EMIT_mov(BPF_REG_0, dst_reg);
 
-			/* xor edx, edx
+			/*
+			 * xor edx, edx
 			 * equivalent to 'xor rdx, rdx', but one byte less
 			 */
 			EMIT2(0x31, 0xd2);
@@ -655,7 +622,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			break;
 		}
-			/* shifts */
+			/* Shifts */
 		case BPF_ALU | BPF_LSH | BPF_K:
 		case BPF_ALU | BPF_RSH | BPF_K:
 		case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +653,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* check for bad case when dst_reg == rcx */
+			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
 				EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +691,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
 			switch (imm32) {
 			case 16:
-				/* emit 'ror %ax, 8' to swap lower 2 bytes */
+				/* Emit 'ror %ax, 8' to swap lower 2 bytes */
 				EMIT1(0x66);
 				if (is_ereg(dst_reg))
 					EMIT1(0x41);
 				EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
 
-				/* emit 'movzwl eax, ax' */
+				/* Emit 'movzwl eax, ax' */
 				if (is_ereg(dst_reg))
 					EMIT3(0x45, 0x0F, 0xB7);
 				else
@@ -738,7 +705,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'bswap eax' to swap lower 4 bytes */
+				/* Emit 'bswap eax' to swap lower 4 bytes */
 				if (is_ereg(dst_reg))
 					EMIT2(0x41, 0x0F);
 				else
@@ -746,7 +713,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_1reg(0xC8, dst_reg));
 				break;
 			case 64:
-				/* emit 'bswap rax' to swap 8 bytes */
+				/* Emit 'bswap rax' to swap 8 bytes */
 				EMIT3(add_1mod(0x48, dst_reg), 0x0F,
 				      add_1reg(0xC8, dst_reg));
 				break;
@@ -756,7 +723,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_LE:
 			switch (imm32) {
 			case 16:
-				/* emit 'movzwl eax, ax' to zero extend 16-bit
+				/*
+				 * Emit 'movzwl eax, ax' to zero extend 16-bit
 				 * into 64 bit
 				 */
 				if (is_ereg(dst_reg))
@@ -766,7 +734,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'mov eax, eax' to clear upper 32-bits */
+				/* Emit 'mov eax, eax' to clear upper 32-bits */
 				if (is_ereg(dst_reg))
 					EMIT1(0x45);
 				EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +777,9 @@ st:			if (is_imm8(insn->off))
 
 			/* STX: *(u8*)(dst_reg + off) = src_reg */
 		case BPF_STX | BPF_MEM | BPF_B:
-			/* emit 'mov byte ptr [rax + off], al' */
+			/* Emit 'mov byte ptr [rax + off], al' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg) ||
-			    /* have to add extra byte for x86 SIL, DIL regs */
+			    /* We have to add extra byte for x86 SIL, DIL regs */
 			    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
 				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
 			else
@@ -840,25 +808,26 @@ stx:			if (is_imm8(insn->off))
 
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
-			/* emit 'movzx rax, byte ptr [rax + off]' */
+			/* Emit 'movzx rax, byte ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_H:
-			/* emit 'movzx rax, word ptr [rax + off]' */
+			/* Emit 'movzx rax, word ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_W:
-			/* emit 'mov eax, dword ptr [rax+0x14]' */
+			/* Emit 'mov eax, dword ptr [rax+0x14]' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
 			else
 				EMIT1(0x8B);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_DW:
-			/* emit 'mov rax, qword ptr [rax+0x14]' */
+			/* Emit 'mov rax, qword ptr [rax+0x14]' */
 			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx:			/* if insn->off == 0 we can save one extra byte, but
-			 * special case of x86 r13 which always needs an offset
+ldx:			/*
+			 * If insn->off == 0 we can save one extra byte, but
+			 * special case of x86 R13 which always needs an offset
 			 * is not worth the hassle
 			 */
 			if (is_imm8(insn->off))
@@ -870,7 +839,7 @@ ldx:			/* if insn->off == 0 we can save one extra byte, but
 
 			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
 		case BPF_STX | BPF_XADD | BPF_W:
-			/* emit 'lock add dword ptr [rax + off], eax' */
+			/* Emit 'lock add dword ptr [rax + off], eax' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
 			else
@@ -889,35 +858,12 @@ xadd:			if (is_imm8(insn->off))
 		case BPF_JMP | BPF_CALL:
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
-			if (seen_ld_abs) {
-				reload_skb_data = bpf_helper_changes_pkt_data(func);
-				if (reload_skb_data) {
-					EMIT1(0x57); /* push %rdi */
-					jmp_offset += 22; /* pop, mov, sub, mov */
-				} else {
-					EMIT2(0x41, 0x52); /* push %r10 */
-					EMIT2(0x41, 0x51); /* push %r9 */
-					/* need to adjust jmp offset, since
-					 * pop %r9, pop %r10 take 4 bytes after call insn
-					 */
-					jmp_offset += 4;
-				}
-			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
 			EMIT1_off32(0xE8, jmp_offset);
-			if (seen_ld_abs) {
-				if (reload_skb_data) {
-					EMIT1(0x5F); /* pop %rdi */
-					emit_load_skb_data_hlen(&prog);
-				} else {
-					EMIT2(0x41, 0x59); /* pop %r9 */
-					EMIT2(0x41, 0x5A); /* pop %r10 */
-				}
-			}
 			break;
 
 		case BPF_JMP | BPF_TAIL_CALL:
@@ -970,7 +916,7 @@ xadd:			if (is_imm8(insn->off))
 			else
 				EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
 
-emit_cond_jmp:		/* convert BPF opcode to x86 */
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
 			switch (BPF_OP(insn->code)) {
 			case BPF_JEQ:
 				jmp_cond = X86_JE;
@@ -996,22 +942,22 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_cond = X86_JBE;
 				break;
 			case BPF_JSGT:
-				/* signed '>', GT in x86 */
+				/* Signed '>', GT in x86 */
 				jmp_cond = X86_JG;
 				break;
 			case BPF_JSLT:
-				/* signed '<', LT in x86 */
+				/* Signed '<', LT in x86 */
 				jmp_cond = X86_JL;
 				break;
 			case BPF_JSGE:
-				/* signed '>=', GE in x86 */
+				/* Signed '>=', GE in x86 */
 				jmp_cond = X86_JGE;
 				break;
 			case BPF_JSLE:
-				/* signed '<=', LE in x86 */
+				/* Signed '<=', LE in x86 */
 				jmp_cond = X86_JLE;
 				break;
-			default: /* to silence gcc warning */
+			default: /* to silence GCC warning */
 				return -EFAULT;
 			}
 			jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1027,9 +973,19 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 			break;
 
 		case BPF_JMP | BPF_JA:
-			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (insn->off == -1)
+				/* -1 jmp instructions will always jump
+				 * backwards two bytes. Explicitly handling
+				 * this case avoids wasting too many passes
+				 * when there are long sequences of replaced
+				 * dead code.
+				 */
+				jmp_offset = -2;
+			else
+				jmp_offset = addrs[i + insn->off] - addrs[i];
+
 			if (!jmp_offset)
-				/* optimize out nop jumps */
+				/* Optimize out nop jumps */
 				break;
 emit_jmp:
 			if (is_imm8(jmp_offset)) {
@@ -1042,66 +998,13 @@ emit_jmp:
 			}
 			break;
 
-		case BPF_LD | BPF_IND | BPF_W:
-			func = sk_load_word;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_W:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
-common_load:
-			ctx->seen_ld_abs = seen_ld_abs = true;
-			jmp_offset = func - (image + addrs[i]);
-			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
-				       imm32, func, image);
-				return -EINVAL;
-			}
-			if (BPF_MODE(insn->code) == BPF_ABS) {
-				/* mov %esi, imm32 */
-				EMIT1_off32(0xBE, imm32);
-			} else {
-				/* mov %rsi, src_reg */
-				EMIT_mov(BPF_REG_2, src_reg);
-				if (imm32) {
-					if (is_imm8(imm32))
-						/* add %esi, imm8 */
-						EMIT3(0x83, 0xC6, imm32);
-					else
-						/* add %esi, imm32 */
-						EMIT2_off32(0x81, 0xC6, imm32);
-				}
-			}
-			/* skb pointer is in R6 (%rbx), it will be copied into
-			 * %rdi if skb_copy_bits() call is necessary.
-			 * sk_load_* helpers also use %r10 and %r9d.
-			 * See bpf_jit.S
-			 */
-			if (seen_ax_reg)
-				/* r10 = skb->data, mov %r10, off32(%rbx) */
-				EMIT3_off32(0x4c, 0x8b, 0x93,
-					    offsetof(struct sk_buff, data));
-			EMIT1_off32(0xE8, jmp_offset); /* call */
-			break;
-
-		case BPF_LD | BPF_IND | BPF_H:
-			func = sk_load_half;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_H:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
-			goto common_load;
-		case BPF_LD | BPF_IND | BPF_B:
-			func = sk_load_byte;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_B:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
-			goto common_load;
-
 		case BPF_JMP | BPF_EXIT:
 			if (seen_exit) {
 				jmp_offset = ctx->cleanup_addr - addrs[i];
 				goto emit_jmp;
 			}
 			seen_exit = true;
-			/* update cleanup_addr */
+			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp+0] */
 			EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1119,10 +1022,11 @@ common_load:
 			break;
 
 		default:
-			/* By design x64 JIT should support all BPF instructions
+			/*
+			 * By design x86-64 JIT should support all BPF instructions.
 			 * This error will be seen if new instruction was added
-			 * to interpreter, but not to JIT
-			 * or if there is junk in bpf_prog
+			 * to the interpreter, but not to the JIT, or if there is
+			 * junk in bpf_prog.
 			 */
 			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
 			return -EINVAL;
@@ -1174,7 +1078,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
-	/* If blinding was requested and we failed during blinding,
+	/*
+	 * If blinding was requested and we failed during blinding,
 	 * we must fall back to the interpreter.
 	 */
 	if (IS_ERR(tmp))
@@ -1208,8 +1113,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_addrs;
 	}
 
-	/* Before first pass, make a rough estimation of addrs[]
-	 * each bpf instruction is translated to less than 64 bytes
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
 	 */
 	for (proglen = 0, i = 0; i < prog->len; i++) {
 		proglen += 64;
@@ -1218,14 +1124,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.cleanup_addr = proglen;
 skip_init_addrs:
 
-	/* JITed image shrinks with every pass and the loop iterates
-	 * until the image stops shrinking. Very large bpf programs
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
 	 * may converge on the last pass. In such case do one more
-	 * pass to emit the final image
+	 * pass to emit the final image.
 	 */
 	for (pass = 0; pass < 20 || image; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
 		if (proglen <= 0) {
+out_image:
 			image = NULL;
 			if (header)
 				bpf_jit_binary_free(header);
@@ -1236,8 +1144,7 @@ skip_init_addrs:
 			if (proglen != oldproglen) {
 				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
 				       proglen, oldproglen);
-				prog = orig_prog;
-				goto out_addrs;
+				goto out_image;
 			}
 			break;
 		}
@@ -1273,7 +1180,7 @@ skip_init_addrs:
 		prog = orig_prog;
 	}
 
-	if (!prog->is_func || extra_pass) {
+	if (!image || !prog->is_func || extra_pass) {
 out_addrs:
 		kfree(addrs);
 		kfree(jit_data);
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000000..0cc04e30adc1
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2419 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ *                         high
+ * original ESP =>        +-----+
+ *                        |     | callee saved registers
+ *                        +-----+
+ *                        | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP  =>    +-----+
+ *                        | ... | eBPF prog stack
+ *                        +-----+
+ *                        |RSVD | JIT scratchpad
+ * current ESP =>         +-----+
+ *                        |     |
+ *                        | ... | Function call stack
+ *                        |     |
+ *                        +-----+
+ *                          low
+ *
+ * The callee saved registers:
+ *
+ *                                high
+ * original ESP =>        +------------------+ \
+ *                        |        ebp       | |
+ * current EBP =>         +------------------+ } callee saved registers
+ *                        |    ebx,esi,edi   | |
+ *                        +------------------+ /
+ *                                low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	if (len == 1)
+		*ptr = bytes;
+	else if (len == 2)
+		*(u16 *)ptr = bytes;
+	else {
+		*(u32 *)ptr = bytes;
+		barrier();
+	}
+	return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)   \
+	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+	return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+	return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k)	(k)
+#define TCALL_CNT	(MAX_BPF_JIT_REG + 0)	/* Tail Call Count */
+
+#define IA32_EAX	(0x0)
+#define IA32_EBX	(0x3)
+#define IA32_ECX	(0x1)
+#define IA32_EDX	(0x2)
+#define IA32_ESI	(0x6)
+#define IA32_EDI	(0x7)
+#define IA32_EBP	(0x5)
+#define IA32_ESP	(0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB  0x72
+#define IA32_JAE 0x73
+#define IA32_JE  0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA  0x77
+#define IA32_JL  0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG  0x7F
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ *    registers.
+ * 3. For performance reason, the BPF_REG_AX for blinding constant, is
+ *    mapped to real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF registers with two IA32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+	/* Return value from in-kernel function, and exit value from eBPF */
+	[BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+	/* The arguments from eBPF program to in-kernel function */
+	/* Stored on stack scratch space */
+	[BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+	[BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+	[BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+	[BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+	[BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+	/* Callee saved registers that in-kernel function will preserve */
+	/* Stored on stack scratch space */
+	[BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+	[BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+	[BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+	[BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+	/* Read only Frame Pointer to access Stack */
+	[BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+	/* Temporary register for blinding constants. */
+	[BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+	/* Tail call count. Stored on stack scratch space. */
+	[TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo	dst[0]
+#define dst_hi	dst[1]
+#define src_lo	src[0]
+#define src_hi	src[1]
+
+#define STACK_ALIGNMENT	8
+/*
+ * Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE	(stack_depth + SCRATCH_SIZE)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Encode 'dst_reg' register into IA32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	return byte + dst_reg;
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	/* Fill whole space with int3 instructions */
+	memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk) {
+		if (val == 0) {
+			/* xor eax,eax */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		} else {
+			EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+				    STACK_VAR(dst), val);
+		}
+	} else {
+		if (val == 0)
+			EMIT2(0x33, add_2reg(0xC0, dst, dst));
+		else
+			EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+				    val);
+	}
+	*pprog = prog;
+}
+
+/* dst = imm (4 bytes)*/
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
+	else
+		/* mov dst,sreg */
+		EMIT2(0x89, add_2reg(0xC0, dst, sreg));
+
+	*pprog = prog;
+}
+
+/* dst = src */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+				     const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+	if (is64)
+		/* complete 8 byte move */
+		emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+	else
+		/* zero out high 4 bytes */
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+}
+
+/* Sign extended move */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+				     const u32 val, bool dstk, u8 **pprog)
+{
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+	emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+	emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst * src
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_ECX : src;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+
+	EMIT2(0xF7, add_1reg(0xE0, sreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov dst,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+					 bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk && val != 64) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/*
+		 * Emit 'movzwl eax,ax' to zero extend 16-bit
+		 * into 64 bit
+		 */
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* nop */
+		break;
+	}
+
+	if (dstk && val != 64) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+				       bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/* Emit 'ror %ax, 8' to swap lower 2 bytes */
+		EMIT1(0x66);
+		EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* Emit 'bswap edx' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_hi));
+
+		/* mov ecx,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* mov dreg_lo,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+		break;
+	}
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+				       bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+	/* xor edx,edx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* div ecx */
+	EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+	if (op == BPF_MOD) {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+	} else {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (shift) src
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 b2;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	switch (op) {
+	case BPF_LSH:
+		b2 = 0xE0; break;
+	case BPF_RSH:
+		b2 = 0xE8; break;
+	case BPF_ARSH:
+		b2 = 0xF8; break;
+	default:
+		return;
+	}
+	EMIT2(0xD3, add_1reg(b2, dreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) src
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+	u8 dreg = dstk ? IA32_EDX : dst;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+	switch (BPF_OP(op)) {
+	/* dst = dst + src */
+	case BPF_ADD:
+		if (hi && is64)
+			EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst - src */
+	case BPF_SUB:
+		if (hi && is64)
+			EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst | src */
+	case BPF_OR:
+		EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & src */
+	case BPF_AND:
+		EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ src */
+	case BPF_XOR:
+		EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+				     const u8 dst[], const u8 src[],
+				     bool dstk,  bool sstk,
+				     u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
+	if (is64)
+		emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
+				&prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) val
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const s32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 sreg = IA32_EDX;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (!is_imm8(val))
+		/* mov edx,imm32*/
+		EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+	switch (op) {
+	/* dst = dst + val */
+	case BPF_ADD:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD0, dreg), val);
+			else
+				EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xC0, dreg), val);
+			else
+				EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst - val */
+	case BPF_SUB:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD8, dreg), val);
+			else
+				EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xE8, dreg), val);
+			else
+				EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst | val */
+	case BPF_OR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xC8, dreg), val);
+		else
+			EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & val */
+	case BPF_AND:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xE0, dreg), val);
+		else
+			EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ val */
+	case BPF_XOR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xF0, dreg), val);
+		else
+			EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	case BPF_NEG:
+		EMIT2(0xF7, add_1reg(0xD8, dreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+				     const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+
+	emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
+	if (is64)
+		emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+
+	*pprog = prog;
+}
+
+/* dst = ~dst (64 bit) */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sub dreg_lo,ecx */
+	EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX));
+	/* mov dreg_lo,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sbb dreg_hi,ecx */
+	EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX));
+	/* mov dreg_hi,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst << src */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* shl dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
+	/* mov ebx,dreg_lo */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shr ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+	/* or dreg_hi,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+	/* mov dreg_hi,dreg_lo */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src (signed)*/
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+				      bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst << val */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do LSH operation */
+	if (val < 32) {
+		/* shl dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
+		/* mov ebx,dreg_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shr ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+		/* or dreg_hi,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val (signed) */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+				      bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	} else {
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_hi));
+	else
+		/* mov eax,dst_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	else
+		/* mul src_hi */
+		EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+	/* add eax,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u32 hi;
+
+	hi = val & (1<<31) ? (u32)~0 : 0;
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	else
+		/* mul dst_hi */
+		EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 4; /* imm32 */
+	else
+		return 0;
+}
+
+struct jit_context {
+	int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE	128
+#define BPF_INSN_SAFETY		64
+
+#define PROLOGUE_SIZE 35
+
+/*
+ * Emit prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+	const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+	/* push ebp */
+	EMIT1(0x55);
+	/* mov ebp,esp */
+	EMIT2(0x89, 0xE5);
+	/* push edi */
+	EMIT1(0x57);
+	/* push esi */
+	EMIT1(0x56);
+	/* push ebx */
+	EMIT1(0x53);
+
+	/* sub esp,STACK_SIZE */
+	EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+	/* sub ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+	/* xor ebx,ebx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+	/* Set up BPF prog stack base register */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+	/* Move BPF_CTX (EAX) to BPF_REG_R1 */
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+	/* Initialize Tail Count */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* Emit epilogue code for BPF program */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	const u8 *r0 = bpf2ia32[BPF_REG_0];
+	int cnt = 0;
+
+	/* mov eax,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+	/* mov edx,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+	/* add ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+	/* mov ebx,dword ptr [ebp-12]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+	/* mov esi,dword ptr [ebp-8]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+	/* mov edi,dword ptr [ebp-4]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+	EMIT1(0xC9); /* leave */
+	EMIT1(0xC3); /* ret */
+	*pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->ptrs[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 *r2 = bpf2ia32[BPF_REG_2];
+	const u8 *r3 = bpf2ia32[BPF_REG_3];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+	u32 lo, hi;
+	static int jmp_label1 = -1;
+
+	/*
+	 * if (index >= array->map.max_entries)
+	 *     goto out;
+	 */
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+	/* mov edx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+	/* cmp dword ptr [eax+off],edx */
+	EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+	      offsetof(struct bpf_array, map.max_entries));
+	/* jbe out */
+	EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *     goto out;
+	 */
+	lo = (u32)MAX_TAIL_CALL_CNT;
+	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* cmp edx,hi */
+	EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
+	EMIT2(IA32_JNE, 3);
+	/* cmp ecx,lo */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
+
+	/* ja out */
+	EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+
+	/* add eax,0x1 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
+	/* adc ebx,0x0 */
+	EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
+
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	/* mov dword ptr [ebp+off],edx */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* prog = array->ptrs[index]; */
+	/* mov edx, [eax + edx * 4 + offsetof(...)] */
+	EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+	/*
+	 * if (prog == NULL)
+	 *     goto out;
+	 */
+	/* test edx,edx */
+	EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* je out */
+	EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	/* mov edx, dword ptr [edx + 32] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+	      offsetof(struct bpf_prog, bpf_func));
+	/* add edx,prologue_size */
+	EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+	/*
+	 * Now we're ready to jump into next BPF program:
+	 * eax == ctx (1st arg)
+	 * edx == prog->bpf_func + prologue_size
+	 */
+	RETPOLINE_EDX_BPF_JIT();
+
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* out: */
+	*pprog = prog;
+}
+
+/* Push the scratch stack register on top of the stack. */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+	/* push ecx */
+	EMIT1(0x51);
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+	/* push ecx */
+	EMIT1(0x51);
+
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		const s32 imm32 = insn->imm;
+		const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+		const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+		const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+		const u8 code = insn->code;
+		const u8 *dst = bpf2ia32[insn->dst_reg];
+		const u8 *src = bpf2ia32[insn->src_reg];
+		const u8 *r0 = bpf2ia32[BPF_REG_0];
+		s64 jmp_offset;
+		u8 jmp_cond;
+		int ilen;
+		u8 *func;
+
+		switch (code) {
+		/* ALU operations */
+		/* dst = src */
+		case BPF_ALU | BPF_MOV | BPF_K:
+		case BPF_ALU | BPF_MOV | BPF_X:
+		case BPF_ALU64 | BPF_MOV | BPF_K:
+		case BPF_ALU64 | BPF_MOV | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mov_r64(is64, dst, src, dstk,
+						  sstk, &prog);
+				break;
+			case BPF_K:
+				/* Sign-extend immediate value to dst reg */
+				emit_ia32_mov_i64(is64, dst, imm32,
+						  dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = dst + src/imm */
+		/* dst = dst - src/imm */
+		/* dst = dst | src/imm */
+		/* dst = dst & src/imm */
+		/* dst = dst ^ src/imm */
+		/* dst = dst * src/imm */
+		/* dst = dst << src */
+		/* dst = dst >> src */
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_K:
+		case BPF_ALU | BPF_XOR | BPF_X:
+		case BPF_ALU64 | BPF_ADD | BPF_K:
+		case BPF_ALU64 | BPF_ADD | BPF_X:
+		case BPF_ALU64 | BPF_SUB | BPF_K:
+		case BPF_ALU64 | BPF_SUB | BPF_X:
+		case BPF_ALU64 | BPF_OR | BPF_K:
+		case BPF_ALU64 | BPF_OR | BPF_X:
+		case BPF_ALU64 | BPF_AND | BPF_K:
+		case BPF_ALU64 | BPF_AND | BPF_X:
+		case BPF_ALU64 | BPF_XOR | BPF_K:
+		case BPF_ALU64 | BPF_XOR | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+						  src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+						  imm32, dstk, &prog);
+				break;
+			}
+			break;
+		case BPF_ALU | BPF_MUL | BPF_K:
+		case BPF_ALU | BPF_MUL | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r(dst_lo, src_lo, dstk,
+						sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+						false, &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU | BPF_LSH | BPF_X:
+		case BPF_ALU | BPF_RSH | BPF_X:
+		case BPF_ALU | BPF_ARSH | BPF_K:
+		case BPF_ALU | BPF_ARSH | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+						  dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_shift_r(BPF_OP(code), dst_lo,
+						  IA32_ECX, dstk, false,
+						  &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst / src(imm) */
+		/* dst = dst % src(imm) */
+		case BPF_ALU | BPF_DIV | BPF_K:
+		case BPF_ALU | BPF_DIV | BPF_X:
+		case BPF_ALU | BPF_MOD | BPF_K:
+		case BPF_ALU | BPF_MOD | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    src_lo, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    IA32_ECX, dstk, false,
+						    &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU64 | BPF_DIV | BPF_K:
+		case BPF_ALU64 | BPF_DIV | BPF_X:
+		case BPF_ALU64 | BPF_MOD | BPF_K:
+		case BPF_ALU64 | BPF_MOD | BPF_X:
+			goto notyet;
+		/* dst = dst >> imm */
+		/* dst = dst << imm */
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 31))
+				return -EINVAL;
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+					  false, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst << imm */
+		case BPF_ALU64 | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst >> imm */
+		case BPF_ALU64 | BPF_RSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst << src */
+		case BPF_ALU64 | BPF_LSH | BPF_X:
+			emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src */
+		case BPF_ALU64 | BPF_RSH | BPF_X:
+			emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> imm (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = ~dst */
+		case BPF_ALU | BPF_NEG:
+			emit_ia32_alu_i(is64, false, BPF_OP(code),
+					dst_lo, 0, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = ~dst (64 bit) */
+		case BPF_ALU64 | BPF_NEG:
+			emit_ia32_neg64(dst, dstk, &prog);
+			break;
+		/* dst = dst * src/imm */
+		case BPF_ALU64 | BPF_MUL | BPF_X:
+		case BPF_ALU64 | BPF_MUL | BPF_K:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = htole(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_LE:
+			emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = htobe(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_BE:
+			emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = imm64 */
+		case BPF_LD | BPF_IMM | BPF_DW: {
+			s32 hi, lo = imm32;
+
+			hi = insn[1].imm;
+			emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+			insn++;
+			i++;
+			break;
+		}
+		/* ST: *(u8*)(dst_reg + off) = imm */
+		case BPF_ST | BPF_MEM | BPF_H:
+		case BPF_ST | BPF_MEM | BPF_B:
+		case BPF_ST | BPF_MEM | BPF_W:
+		case BPF_ST | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0xC6, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0xC7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0xC7, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+			else
+				EMIT1_off32(add_1reg(0x80, IA32_EAX),
+					    insn->off);
+			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				u32 hi;
+
+				hi = imm32 & (1<<31) ? (u32)~0 : 0;
+				EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+					    insn->off + 4);
+				EMIT(hi, 4);
+			}
+			break;
+
+		/* STX: *(u8*)(dst_reg + off) = src_reg */
+		case BPF_STX | BPF_MEM | BPF_B:
+		case BPF_STX | BPF_MEM | BPF_H:
+		case BPF_STX | BPF_MEM | BPF_W:
+		case BPF_STX | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			if (sstk)
+				/* mov edx,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov edx,src_lo */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0x88, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0x89); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x89, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				if (sstk)
+					/* mov edi,dword ptr [ebp+off] */
+					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+							     IA32_EDX),
+					      STACK_VAR(src_hi));
+				else
+					/* mov edi,src_hi */
+					EMIT2(0x8B, add_2reg(0xC0, src_hi,
+							     IA32_EDX));
+				EMIT1(0x89);
+				if (is_imm8(insn->off + 4)) {
+					EMIT2(add_2reg(0x40, IA32_EAX,
+						       IA32_EDX),
+					      insn->off + 4);
+				} else {
+					EMIT1(add_2reg(0x80, IA32_EAX,
+						       IA32_EDX));
+					EMIT(insn->off + 4, 4);
+				}
+			}
+			break;
+
+		/* LDX: dst_reg = *(u8*)(src_reg + off) */
+		case BPF_LDX | BPF_MEM | BPF_B:
+		case BPF_LDX | BPF_MEM | BPF_H:
+		case BPF_LDX | BPF_MEM | BPF_W:
+		case BPF_LDX | BPF_MEM | BPF_DW:
+			if (sstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT2(0x0F, 0xB6); break;
+			case BPF_H:
+				EMIT2(0x0F, 0xB7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x8B, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (dstk)
+				/* mov dword ptr [ebp+off],edx */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov dst_lo,edx */
+				EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+			case BPF_H:
+			case BPF_W:
+				if (dstk) {
+					EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+					      STACK_VAR(dst_hi));
+					EMIT(0x0, 4);
+				} else {
+					EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+				}
+				break;
+			case BPF_DW:
+				EMIT2_off32(0x8B,
+					    add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off + 4);
+				if (dstk)
+					EMIT3(0x89,
+					      add_2reg(0x40, IA32_EBP,
+						       IA32_EDX),
+					      STACK_VAR(dst_hi));
+				else
+					EMIT2(0x89,
+					      add_2reg(0xC0, dst_hi, IA32_EDX));
+				break;
+			default:
+				break;
+			}
+			break;
+		/* call */
+		case BPF_JMP | BPF_CALL:
+		{
+			const u8 *r1 = bpf2ia32[BPF_REG_1];
+			const u8 *r2 = bpf2ia32[BPF_REG_2];
+			const u8 *r3 = bpf2ia32[BPF_REG_3];
+			const u8 *r4 = bpf2ia32[BPF_REG_4];
+			const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				goto notyet;
+
+			func = (u8 *) __bpf_call_base + imm32;
+			jmp_offset = func - (image + addrs[i]);
+
+			if (!imm32 || !is_simm32(jmp_offset)) {
+				pr_err("unsupported BPF func %d addr %p image %p\n",
+				       imm32, func, image);
+				return -EINVAL;
+			}
+
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r1[0]));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r1[1]));
+
+			emit_push_r64(r5, &prog);
+			emit_push_r64(r4, &prog);
+			emit_push_r64(r3, &prog);
+			emit_push_r64(r2, &prog);
+
+			EMIT1_off32(0xE8, jmp_offset + 9);
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[1]));
+
+			/* add esp,32 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+			break;
+		}
+		case BPF_JMP | BPF_TAIL_CALL:
+			emit_bpf_tail_call(&prog);
+			break;
+
+		/* cond jump */
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JNE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JLT | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JLE | BPF_X:
+		case BPF_JMP | BPF_JSGT | BPF_X:
+		case BPF_JMP | BPF_JSLE | BPF_X:
+		case BPF_JMP | BPF_JSLT | BPF_X:
+		case BPF_JMP | BPF_JSGE | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JNE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JLT | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JLE | BPF_K:
+		case BPF_JMP | BPF_JSGT | BPF_K:
+		case BPF_JMP | BPF_JSLE | BPF_K:
+		case BPF_JMP | BPF_JSLT | BPF_K:
+		case BPF_JMP | BPF_JSGE | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				jmp_cond = IA32_JE;
+				break;
+			case BPF_JSET:
+			case BPF_JNE:
+				jmp_cond = IA32_JNE;
+				break;
+			case BPF_JGT:
+				/* GT is unsigned '>', JA in x86 */
+				jmp_cond = IA32_JA;
+				break;
+			case BPF_JLT:
+				/* LT is unsigned '<', JB in x86 */
+				jmp_cond = IA32_JB;
+				break;
+			case BPF_JGE:
+				/* GE is unsigned '>=', JAE in x86 */
+				jmp_cond = IA32_JAE;
+				break;
+			case BPF_JLE:
+				/* LE is unsigned '<=', JBE in x86 */
+				jmp_cond = IA32_JBE;
+				break;
+			case BPF_JSGT:
+				/* Signed '>', GT in x86 */
+				jmp_cond = IA32_JG;
+				break;
+			case BPF_JSLT:
+				/* Signed '<', LT in x86 */
+				jmp_cond = IA32_JL;
+				break;
+			case BPF_JSGE:
+				/* Signed '>=', GE in x86 */
+				jmp_cond = IA32_JGE;
+				break;
+			case BPF_JSLE:
+				/* Signed '<=', LE in x86 */
+				jmp_cond = IA32_JLE;
+				break;
+			default: /* to silence GCC warning */
+				return -EFAULT;
+			}
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(jmp_cond, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+			} else {
+				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+		}
+		case BPF_JMP | BPF_JA:
+			if (insn->off == -1)
+				/* -1 jmp instructions will always jump
+				 * backwards two bytes. Explicitly handling
+				 * this case avoids wasting too many passes
+				 * when there are long sequences of replaced
+				 * dead code.
+				 */
+				jmp_offset = -2;
+			else
+				jmp_offset = addrs[i + insn->off] - addrs[i];
+
+			if (!jmp_offset)
+				/* Optimize out nop jumps */
+				break;
+emit_jmp:
+			if (is_imm8(jmp_offset)) {
+				EMIT2(0xEB, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT1_off32(0xE9, jmp_offset);
+			} else {
+				pr_err("jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+			break;
+		/* STX XADD: lock *(u32 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_W:
+		/* STX XADD: lock *(u64 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_DW:
+			goto notyet;
+		case BPF_JMP | BPF_EXIT:
+			if (seen_exit) {
+				jmp_offset = ctx->cleanup_addr - addrs[i];
+				goto emit_jmp;
+			}
+			seen_exit = true;
+			/* Update cleanup_addr */
+			ctx->cleanup_addr = proglen;
+			emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+			break;
+notyet:
+			pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+			return -EFAULT;
+		default:
+			/*
+			 * This error will be seen if new instruction was added
+			 * to interpreter, but not to JIT or if there is junk in
+			 * bpf_prog
+			 */
+			pr_err("bpf_jit: unknown opcode %02x\n", code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (ilen > BPF_MAX_INSN_SIZE) {
+			pr_err("bpf_jit: fatal insn size error\n");
+			return -EFAULT;
+		}
+
+		if (image) {
+			if (unlikely(proglen + ilen > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/*
+	 * If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out;
+	}
+
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image.
+	 */
+	for (pass = 0; pass < 20 || image; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+		if (proglen <= 0) {
+out_image:
+			image = NULL;
+			if (header)
+				bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		if (image) {
+			if (proglen != oldproglen) {
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+				goto out_image;
+			}
+			break;
+		}
+		if (proglen == oldproglen) {
+			header = bpf_jit_binary_alloc(proglen, &image,
+						      1, jit_fill_hole);
+			if (!header) {
+				prog = orig_prog;
+				goto out_addrs;
+			}
+		}
+		oldproglen = proglen;
+		cond_resched();
+	}
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+	if (image) {
+		bpf_jit_binary_lock_ro(header);
+		prog->bpf_func = (void *)image;
+		prog->jited = 1;
+		prog->jited_len = proglen;
+	} else {
+		prog = orig_prog;
+	}
+
+out_addrs:
+	kfree(addrs);
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	return prog;
+}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index bed7e7f4e44c..e01f7ceb9e7a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -225,7 +225,7 @@ int __init efi_alloc_page_tables(void)
 
 	pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
 	if (!pud) {
-		if (pgtable_l5_enabled)
+		if (pgtable_l5_enabled())
 			free_page((unsigned long) pgd_page_vaddr(*pgd));
 		free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
 		return -ENOMEM;
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 58024862a7eb..a52914aa3b6c 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -57,7 +57,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg)
 }
 EXPORT_SYMBOL_GPL(vrtc_cmos_write);
 
-void vrtc_get_time(struct timespec *now)
+void vrtc_get_time(struct timespec64 *now)
 {
 	u8 sec, min, hour, mday, mon;
 	unsigned long flags;
@@ -83,18 +83,18 @@ void vrtc_get_time(struct timespec *now)
 	pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
 		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);
 
-	now->tv_sec = mktime(year, mon, mday, hour, min, sec);
+	now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
 	now->tv_nsec = 0;
 }
 
-int vrtc_set_mmss(const struct timespec *now)
+int vrtc_set_mmss(const struct timespec64 *now)
 {
 	unsigned long flags;
 	struct rtc_time tm;
 	int year;
 	int retval = 0;
 
-	rtc_time_to_tm(now->tv_sec, &tm);
+	rtc_time64_to_tm(now->tv_sec, &tm);
 	if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) {
 		/*
 		 * tm.year is the number of years since 1900, and the
@@ -110,8 +110,8 @@ int vrtc_set_mmss(const struct timespec *now)
 		vrtc_cmos_write(tm.tm_sec, RTC_SECONDS);
 		spin_unlock_irqrestore(&rtc_lock, flags);
 	} else {
-		pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
-			__func__, now->tv_sec);
+		pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n",
+			__func__, (s64)now->tv_sec);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 48b14b534897..67ccf64c8bd8 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -72,7 +72,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 	 * tables used by the image kernel.
 	 */
 
-	if (pgtable_l5_enabled) {
+	if (pgtable_l5_enabled()) {
 		p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
 		if (!p4d)
 			return -ENOMEM;
@@ -98,7 +98,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 		set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
 	} else {
 		/* No p4d for 4-level paging: point the pgd to the pud page table */
-		pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot));
+		pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot));
 		set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
 	}
 
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 13ed827c7c66..9d529f22fd9d 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -1,5 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
-mainmenu "User Mode Linux/$SUBARCH $KERNELVERSION Kernel Configuration"
+mainmenu "User Mode Linux/$(SUBARCH) $(KERNELVERSION) Kernel Configuration"
+
+comment "Compiler: $(CC_VERSION_TEXT)"
+
+source "scripts/Kconfig.include"
 
 source "arch/um/Kconfig.common"
 
@@ -16,8 +20,8 @@ config UML_X86
 	select GENERIC_FIND_FIRST_BIT
 
 config 64BIT
-	bool "64-bit kernel" if SUBARCH = "x86"
-	default SUBARCH != "i386"
+	bool "64-bit kernel" if "$(SUBARCH)" = "x86"
+	default "$(SUBARCH)" != "i386"
 
 config X86_32
 	def_bool !64BIT
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 10003359e633..b2d6967262b2 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -23,14 +23,14 @@ $(obj)/vdso.o: $(obj)/vdso.so
 
 targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
 
-export CPPFLAGS_vdso.lds += -P -C
+CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
        -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
 
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
 
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
 
 $(obj)/%.so: OBJCOPYFLAGS := -S
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a18703be9ead..1804b27f9632 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -115,6 +115,61 @@ static efi_system_table_t __init *xen_efi_probe(void)
 	return &efi_systab_xen;
 }
 
+/*
+ * Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
+ */
+static enum efi_secureboot_mode xen_efi_get_secureboot(void)
+{
+	static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+	static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+	efi_status_t status;
+	u8 moksbstate, secboot, setupmode;
+	unsigned long size;
+
+	size = sizeof(secboot);
+	status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
+				  NULL, &size, &secboot);
+
+	if (status == EFI_NOT_FOUND)
+		return efi_secureboot_mode_disabled;
+
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	size = sizeof(setupmode);
+	status = efi.get_variable(L"SetupMode", &efi_variable_guid,
+				  NULL, &size, &setupmode);
+
+	if (status != EFI_SUCCESS)
+		goto out_efi_err;
+
+	if (secboot == 0 || setupmode == 1)
+		return efi_secureboot_mode_disabled;
+
+	/* See if a user has put the shim into insecure mode. */
+	size = sizeof(moksbstate);
+	status = efi.get_variable(L"MokSBStateRT", &shim_guid,
+				  NULL, &size, &moksbstate);
+
+	/* If it fails, we don't care why. Default to secure. */
+	if (status != EFI_SUCCESS)
+		goto secure_boot_enabled;
+
+	if (moksbstate == 1)
+		return efi_secureboot_mode_disabled;
+
+ secure_boot_enabled:
+	pr_info("UEFI Secure Boot is enabled.\n");
+	return efi_secureboot_mode_enabled;
+
+ out_efi_err:
+	pr_err("Could not determine UEFI Secure Boot status.\n");
+	return efi_secureboot_mode_unknown;
+}
+
 void __init xen_efi_init(void)
 {
 	efi_system_table_t *efi_systab_xen;
@@ -129,6 +184,8 @@ void __init xen_efi_init(void)
 	boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
 	boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
 
+	boot_params.secure_boot = xen_efi_get_secureboot();
+
 	set_bit(EFI_BOOT, &efi.flags);
 	set_bit(EFI_PARAVIRT, &efi.flags);
 	set_bit(EFI_64BIT, &efi.flags);
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 826898701045..19c1ff542387 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -65,6 +65,19 @@ static void __init xen_hvm_init_mem_mapping(void)
 {
 	early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
 	HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+
+	/*
+	 * The virtual address of the shared_info page has changed, so
+	 * the vcpu_info pointer for VCPU 0 is now stale.
+	 *
+	 * The prepare_boot_cpu callback will re-initialize it via
+	 * xen_vcpu_setup, but we can't rely on that to be called for
+	 * old Xen versions (xen_have_vector_callback == 0).
+	 *
+	 * It is, in any case, bad to have a stale vcpu_info pointer
+	 * so reset it now.
+	 */
+	xen_vcpu_info_reset(0);
 }
 
 static void __init init_hvm_pv_info(void)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c36d23aa6c35..357969a3697c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -421,45 +421,33 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 {
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
-	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	unsigned long frames[pages];
-	int f;
-
-	/*
-	 * A GDT can be up to 64k in size, which corresponds to 8192
-	 * 8-byte entries, or 16 4k pages..
-	 */
+	unsigned long pfn, mfn;
+	int level;
+	pte_t *ptep;
+	void *virt;
 
-	BUG_ON(size > 65536);
+	/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+	BUG_ON(size > PAGE_SIZE);
 	BUG_ON(va & ~PAGE_MASK);
 
-	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		int level;
-		pte_t *ptep;
-		unsigned long pfn, mfn;
-		void *virt;
-
-		/*
-		 * The GDT is per-cpu and is in the percpu data area.
-		 * That can be virtually mapped, so we need to do a
-		 * page-walk to get the underlying MFN for the
-		 * hypercall.  The page can also be in the kernel's
-		 * linear range, so we need to RO that mapping too.
-		 */
-		ptep = lookup_address(va, &level);
-		BUG_ON(ptep == NULL);
-
-		pfn = pte_pfn(*ptep);
-		mfn = pfn_to_mfn(pfn);
-		virt = __va(PFN_PHYS(pfn));
+	/*
+	 * The GDT is per-cpu and is in the percpu data area.
+	 * That can be virtually mapped, so we need to do a
+	 * page-walk to get the underlying MFN for the
+	 * hypercall.  The page can also be in the kernel's
+	 * linear range, so we need to RO that mapping too.
+	 */
+	ptep = lookup_address(va, &level);
+	BUG_ON(ptep == NULL);
 
-		frames[f] = mfn;
+	pfn = pte_pfn(*ptep);
+	mfn = pfn_to_mfn(pfn);
+	virt = __va(PFN_PHYS(pfn));
 
-		make_lowmem_page_readonly((void *)va);
-		make_lowmem_page_readonly(virt);
-	}
+	make_lowmem_page_readonly((void *)va);
+	make_lowmem_page_readonly(virt);
 
-	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+	if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
 		BUG();
 }
 
@@ -470,34 +458,22 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 {
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
-	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	unsigned long frames[pages];
-	int f;
-
-	/*
-	 * A GDT can be up to 64k in size, which corresponds to 8192
-	 * 8-byte entries, or 16 4k pages..
-	 */
+	unsigned long pfn, mfn;
+	pte_t pte;
 
-	BUG_ON(size > 65536);
+	/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+	BUG_ON(size > PAGE_SIZE);
 	BUG_ON(va & ~PAGE_MASK);
 
-	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		pte_t pte;
-		unsigned long pfn, mfn;
+	pfn = virt_to_pfn(va);
+	mfn = pfn_to_mfn(pfn);
 
-		pfn = virt_to_pfn(va);
-		mfn = pfn_to_mfn(pfn);
+	pte = pfn_pte(pfn, PAGE_KERNEL_RO);
 
-		pte = pfn_pte(pfn, PAGE_KERNEL_RO);
-
-		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
-			BUG();
-
-		frames[f] = mfn;
-	}
+	if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+		BUG();
 
-	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+	if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
 		BUG();
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index d33e7dbe3129..2d76106788a3 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,13 +42,11 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 }
 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 
-static void xen_flush_tlb_all(void)
+static noinline void xen_flush_tlb_all(void)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_all(0);
-
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 486c0a34d00b..2c30cabfda90 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1310,13 +1310,11 @@ unsigned long xen_read_cr2_direct(void)
 	return this_cpu_read(xen_vcpu_info.arch.cr2);
 }
 
-static void xen_flush_tlb(void)
+static noinline void xen_flush_tlb(void)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb(0);
-
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 29163c43ebbd..e0f1bcf01d63 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -57,7 +57,7 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
 	return xen_clocksource_read();
 }
 
-static void xen_read_wallclock(struct timespec *ts)
+static void xen_read_wallclock(struct timespec64 *ts)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	struct pvclock_wall_clock *wall_clock = &(s->wc);
@@ -68,12 +68,12 @@ static void xen_read_wallclock(struct timespec *ts)
 	put_cpu_var(xen_vcpu);
 }
 
-static void xen_get_wallclock(struct timespec *now)
+static void xen_get_wallclock(struct timespec64 *now)
 {
 	xen_read_wallclock(now);
 }
 
-static int xen_set_wallclock(const struct timespec *now)
+static int xen_set_wallclock(const struct timespec64 *now)
 {
 	return -ENODEV;
 }
@@ -461,7 +461,7 @@ static void __init xen_time_init(void)
 {
 	struct pvclock_vcpu_time_info *pvti;
 	int cpu = smp_processor_id();
-	struct timespec tp;
+	struct timespec64 tp;
 
 	/* As Dom0 is never moved, no penalty on using TSC there */
 	if (xen_initial_domain())
@@ -479,7 +479,7 @@ static void __init xen_time_init(void)
 
 	/* Set initial system time with full resolution */
 	xen_read_wallclock(&tp);
-	do_settimeofday(&tp);
+	do_settimeofday64(&tp);
 
 	setup_force_cpu_cap(X86_FEATURE_TSC);