634 files changed, 13590 insertions, 9593 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089117fe..2520ca5b42eb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -931,6 +931,18 @@ config STRICT_MODULE_RWX
 config ARCH_WANT_RELAX_ORDER
 	bool
 
+config ARCH_HAS_REFCOUNT
+	bool
+	help
+	  An architecture selects this when it has implemented refcount_t
+	  using open coded assembly primitives that provide an optimized
+	  refcount_t implementation, possibly at the expense of some full
+	  refcount state checks of CONFIG_REFCOUNT_FULL=y.
+
+	  The refcount overflow check behavior, however, must be retained.
+	  Catching overflows is the primary security concern for protecting
+	  against bugs in reference counts.
+
 config REFCOUNT_FULL
 	bool "Perform full reference count validation at the expense of speed"
 	help
diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index fb01dfb760c2..05a70edd57b6 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -25,18 +25,10 @@
 	:	"r" (uaddr), "r"(oparg)				\
 	:	"memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index ff4049155c84..4d61d2a50c52 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -299,6 +299,7 @@ static inline void __iomem * ioremap_nocache(unsigned long offset,
 	return ioremap(offset, size);
 }
 
+#define ioremap_wc ioremap_nocache
 #define ioremap_uc ioremap_nocache
 
 static inline void iounmap(volatile void __iomem *addr)
diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h
index a40b9fc0c6c3..718ac0b64adf 100644
--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -16,11 +16,6 @@
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 #define arch_spin_is_locked(x)	((x)->lock != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
         return lock.lock == 0;
diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h
index 4cb4b6d3452c..0bc66e1d3a7e 100644
--- a/arch/alpha/include/asm/types.h
+++ b/arch/alpha/include/asm/types.h
@@ -1,6 +1,6 @@
 #ifndef _ALPHA_TYPES_H
 #define _ALPHA_TYPES_H
 
-#include <asm-generic/int-ll64.h>
+#include <uapi/asm/types.h>
 
 #endif /* _ALPHA_TYPES_H */
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index b37153ecf2ac..db7fc0f511e2 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -3,7 +3,7 @@
 
 #include <uapi/asm/unistd.h>
 
-#define NR_SYSCALLS			514
+#define NR_SYSCALLS			523
 
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/arch/alpha/include/uapi/asm/ioctls.h b/arch/alpha/include/uapi/asm/ioctls.h
index ff67b8373bf7..1cd7dc7d4870 100644
--- a/arch/alpha/include/uapi/asm/ioctls.h
+++ b/arch/alpha/include/uapi/asm/ioctls.h
@@ -100,7 +100,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 #define TIOCSERCONFIG	0x5453
 #define TIOCSERGWILD	0x5454
diff --git a/arch/alpha/include/uapi/asm/types.h b/arch/alpha/include/uapi/asm/types.h
index 9fd3cd459777..8d1024d7be05 100644
--- a/arch/alpha/include/uapi/asm/types.h
+++ b/arch/alpha/include/uapi/asm/types.h
@@ -9,8 +9,18 @@
  * need to be careful to avoid a name clashes.
  */
 
-#ifndef __KERNEL__
+/*
+ * This is here because we used to use l64 for alpha
+ * and we don't want to impact user mode with our change to ll64
+ * in the kernel.
+ *
+ * However, some user programs are fine with this.  They can
+ * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here.
+ */
+#if !defined(__SANE_USERSPACE_TYPES__) && !defined(__KERNEL__)
 #include <asm-generic/int-l64.h>
+#else
+#include <asm-generic/int-ll64.h>
 #endif
 
 #endif /* _UAPI_ALPHA_TYPES_H */
diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h
index aa33bf5aacb6..a2945fea6c86 100644
--- a/arch/alpha/include/uapi/asm/unistd.h
+++ b/arch/alpha/include/uapi/asm/unistd.h
@@ -475,5 +475,19 @@
 #define __NR_getrandom			511
 #define __NR_memfd_create		512
 #define __NR_execveat			513
+#define __NR_seccomp			514
+#define __NR_bpf			515
+#define __NR_userfaultfd		516
+#define __NR_membarrier			517
+#define __NR_mlock2			518
+#define __NR_copy_file_range		519
+#define __NR_preadv2			520
+#define __NR_pwritev2			521
+#define __NR_statx			522
+
+/* Alpha doesn't have protection keys. */
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
 
 #endif /* _UAPI_ALPHA_UNISTD_H */
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index d5f0580746a5..03ff832b1cb4 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -351,7 +351,7 @@ marvel_init_io7(struct io7 *io7)
 	}
 }
 
-void
+void __init
 marvel_io7_present(gct6_node *node)
 {
 	int pe;
@@ -369,6 +369,7 @@ marvel_io7_present(gct6_node *node)
 static void __init
 marvel_find_console_vga_hose(void)
 {
+#ifdef CONFIG_VGA_HOSE
 	u64 *pu64 = (u64 *)((u64)hwrpb + hwrpb->ctbt_offset);
 
 	if (pu64[7] == 3) {	/* TERM_TYPE == graphics */
@@ -402,9 +403,10 @@ marvel_find_console_vga_hose(void)
 			pci_vga_hose = hose;
 		}
 	}
+#endif
 }
 
-gct6_search_struct gct_wanted_node_list[] = {
+gct6_search_struct gct_wanted_node_list[] __initdata = {
 	{ GCT_TYPE_HOSE, GCT_SUBTYPE_IO_PORT_MODULE, marvel_io7_present },
 	{ 0, 0, NULL }
 };
diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c
index 219bf271c0ba..b532d925443d 100644
--- a/arch/alpha/kernel/core_titan.c
+++ b/arch/alpha/kernel/core_titan.c
@@ -461,6 +461,7 @@ titan_ioremap(unsigned long addr, unsigned long size)
 	unsigned long *ptes;
 	unsigned long pfn;
 
+#ifdef CONFIG_VGA_HOSE
 	/*
 	 * Adjust the address and hose, if necessary.
 	 */ 
@@ -468,6 +469,7 @@ titan_ioremap(unsigned long addr, unsigned long size)
 		h = pci_vga_hose->index;
 		addr += pci_vga_hose->mem_space->start;
 	}
+#endif
 
 	/*
 	 * Find the hose.
diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c
index 936bc8f89a67..47632fa8c24e 100644
--- a/arch/alpha/kernel/module.c
+++ b/arch/alpha/kernel/module.c
@@ -181,6 +181,9 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab,
 		switch (r_type) {
 		case R_ALPHA_NONE:
 			break;
+		case R_ALPHA_REFLONG:
+			*(u32 *)location = value;
+			break;
 		case R_ALPHA_REFQUAD:
 			/* BUG() can produce misaligned relocations. */
 			((u32 *)location)[0] = value;
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 9fc560459ebd..f6726a746427 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -115,7 +115,7 @@ wait_boot_cpu_to_stop(int cpuid)
 /*
  * Where secondaries begin a life of C.
  */
-void
+void __init
 smp_callin(void)
 {
 	int cpuid = hard_smp_processor_id();
diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 9b62e3fd4f03..5b4514abb234 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -532,6 +532,15 @@ sys_call_table:
 	.quad sys_getrandom
 	.quad sys_memfd_create
 	.quad sys_execveat
+	.quad sys_seccomp
+	.quad sys_bpf				/* 515 */
+	.quad sys_userfaultfd
+	.quad sys_membarrier
+	.quad sys_mlock2
+	.quad sys_copy_file_range
+	.quad sys_preadv2			/* 520 */
+	.quad sys_pwritev2
+	.quad sys_statx
 
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
index 7083434dd241..a80815960364 100644
--- a/arch/alpha/lib/Makefile
+++ b/arch/alpha/lib/Makefile
@@ -20,12 +20,8 @@ lib-y =	__divqu.o __remqu.o __divlu.o __remlu.o \
 	checksum.o \
 	csum_partial_copy.o \
 	$(ev67-y)strlen.o \
-	$(ev67-y)strcat.o \
-	strcpy.o \
-	$(ev67-y)strncat.o \
-	strncpy.o \
-	$(ev6-y)stxcpy.o \
-	$(ev6-y)stxncpy.o \
+	stycpy.o \
+	styncpy.o \
 	$(ev67-y)strchr.o \
 	$(ev67-y)strrchr.o \
 	$(ev6-y)memchr.o \
@@ -49,3 +45,17 @@ AFLAGS___remlu.o =       -DREM -DINTSIZE
 $(addprefix $(obj)/,__divqu.o __remqu.o __divlu.o __remlu.o): \
 						$(src)/$(ev6-y)divide.S FORCE
 	$(call if_changed_rule,as_o_S)
+
+# There are direct branches between {str*cpy,str*cat} and stx*cpy.
+# Ensure the branches are within range by merging these objects.
+
+LDFLAGS_stycpy.o := -r
+LDFLAGS_styncpy.o := -r
+
+$(obj)/stycpy.o: $(obj)/strcpy.o $(obj)/$(ev67-y)strcat.o \
+		 $(obj)/$(ev6-y)stxcpy.o FORCE
+	$(call if_changed,ld)
+
+$(obj)/styncpy.o: $(obj)/strncpy.o $(obj)/$(ev67-y)strncat.o \
+		 $(obj)/$(ev6-y)stxncpy.o FORCE
+	$(call if_changed,ld)
diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S
index 159f1b7e6e49..c277a1a4383e 100644
--- a/arch/alpha/lib/copy_user.S
+++ b/arch/alpha/lib/copy_user.S
@@ -34,7 +34,7 @@
 	.ent __copy_user
 __copy_user:
 	.prologue 0
-	and $18,$18,$0
+	mov $18,$0
 	and $16,7,$3
 	beq $0,$35
 	beq $3,$36
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S
index 35e6710d0700..954ca03ebebe 100644
--- a/arch/alpha/lib/ev6-copy_user.S
+++ b/arch/alpha/lib/ev6-copy_user.S
@@ -45,9 +45,10 @@
 				# Pipeline info: Slotting & Comments
 __copy_user:
 	.prologue 0
-	andq $18, $18, $0
-	subq $18, 32, $1	# .. E  .. ..	: Is this going to be a small copy?
-	beq $0, $zerolength	# U  .. .. ..	: U L U L
+	mov $18, $0		# .. .. .. E
+	subq $18, 32, $1	# .. .. E. ..	: Is this going to be a small copy?
+	nop			# .. E  .. ..
+	beq $18, $zerolength	# U  .. .. ..	: U L U L
 
 	and $16,7,$3		# .. .. .. E	: is leading dest misalignment
 	ble $1, $onebyteloop	# .. .. U  ..	: 1st branch : small amount of data
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index a5459698f0ee..7db85ab00c52 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -96,7 +96,6 @@ menu "ARC Architecture Configuration"
 
 menu "ARC Platform/SoC/Board"
 
-source "arch/arc/plat-sim/Kconfig"
 source "arch/arc/plat-tb10x/Kconfig"
 source "arch/arc/plat-axs10x/Kconfig"
 #New platform adds here
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index 44ef35d33956..3a61cfcc38c0 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -107,7 +107,7 @@ core-y		+= arch/arc/
 # w/o this dtb won't embed into kernel binary
 core-y		+= arch/arc/boot/dts/
 
-core-$(CONFIG_ARC_PLAT_SIM)	+= arch/arc/plat-sim/
+core-y				+= arch/arc/plat-sim/
 core-$(CONFIG_ARC_PLAT_TB10X)	+= arch/arc/plat-tb10x/
 core-$(CONFIG_ARC_PLAT_AXS10X)	+= arch/arc/plat-axs10x/
 core-$(CONFIG_ARC_PLAT_EZNPS)	+= arch/arc/plat-eznps/
diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi
index 53ce226f77a5..a380ffa1a458 100644
--- a/arch/arc/boot/dts/axc001.dtsi
+++ b/arch/arc/boot/dts/axc001.dtsi
@@ -15,15 +15,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -91,23 +91,21 @@
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&core_intc>;
 		interrupts = < 7 >;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x20000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x1b000000>;	/* (512 - 32) MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x1b000000>;	/* (512 - 32) MiB */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * We just move frame buffer area to the very end of
@@ -118,7 +116,7 @@
 		 */
 		frame_buffer: frame_buffer@9e000000 {
 			compatible = "shared-dma-pool";
-			reg = <0x9e000000 0x2000000>;
+			reg = <0x0 0x9e000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 14df46f141bf..cc9239ef8d08 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -14,15 +14,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -94,30 +94,29 @@
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&core_intc>;
 		interrupts = < 24 >;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x40000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x20000000>;	/* 512MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x20000000	/* 512 MiB low mem */
+		       0x1 0xc0000000 0x0 0x40000000>;	/* 1 GiB highmem */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * Move frame buffer out of IOC aperture (0x8z-0xAz).
 		 */
 		frame_buffer: frame_buffer@be000000 {
 			compatible = "shared-dma-pool";
-			reg = <0xbe000000 0x2000000>;
+			reg = <0x0 0xbe000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 695f9fa1996b..4ebb2170abec 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -14,15 +14,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -100,30 +100,29 @@
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&idu_intc>;
 		interrupts = <0>;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x40000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x20000000>;	/* 512MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x20000000	/* 512 MiB low mem */
+		       0x1 0xc0000000 0x0 0x40000000>;	/* 1 GiB highmem */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * Move frame buffer out of IOC aperture (0x8z-0xAz).
 		 */
 		frame_buffer: frame_buffer@be000000 {
 			compatible = "shared-dma-pool";
-			reg = <0xbe000000 0x2000000>;
+			reg = <0x0 0xbe000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi
index 41cfb29b62c1..0ff7e07edcd4 100644
--- a/arch/arc/boot/dts/axs10x_mb.dtsi
+++ b/arch/arc/boot/dts/axs10x_mb.dtsi
@@ -13,7 +13,7 @@
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
-		ranges = <0x00000000 0xe0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xe0000000 0x10000000>;
 		interrupt-parent = <&mb_intc>;
 
 		i2sclk: i2sclk@100a0 {
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 57b3e599322f..db04ea4dd2d9 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -21,7 +21,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs"
 CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index f85985adebb2..821a2e562f3f 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs_idu"
diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig
index ede625c76216..7c9c706ae7f6 100644
--- a/arch/arc/configs/nps_defconfig
+++ b/arch/arc/configs/nps_defconfig
@@ -39,7 +39,6 @@ CONFIG_IP_PNP=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index b0066a749d4c..6dff83a238b8 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700"
 CONFIG_PREEMPT=y
 # CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index ebe9ebb92933..31ee51b987e7 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -26,7 +26,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs"
 CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index 4bde43278be6..8d3b1f67cae4 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -24,7 +24,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu"
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index f6fb3d26557e..6168ce2ac2ef 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci"
 # CONFIG_COMPACTION is not set
 CONFIG_NET=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index b9f0fe00044b..a70bdeb2b3fd 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs"
 # CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 155add7761ed..ef96406c446e 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -18,7 +18,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 # CONFIG_ARC_TIMERS_64BIT is not set
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 4c5118384eb5..f30182549395 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 54b54da6384c..11859287c52a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i)
 	atomic_ops_unlock(flags);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 #endif
 
 /*
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index 19ebddffb279..02fd1cece6ef 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -96,7 +96,9 @@ extern unsigned long perip_base, perip_end;
 #define ARC_REG_SLC_FLUSH	0x904
 #define ARC_REG_SLC_INVALIDATE	0x905
 #define ARC_REG_SLC_RGN_START	0x914
+#define ARC_REG_SLC_RGN_START1	0x915
 #define ARC_REG_SLC_RGN_END	0x916
+#define ARC_REG_SLC_RGN_END1	0x917
 
 /* Bit val in SLC_CONTROL */
 #define SLC_CTRL_DIS		0x001
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 11e1b1f3acda..eb887dd13e74 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -73,20 +73,11 @@
 
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
-
 #ifndef CONFIG_ARC_HAS_LLSC
 	preempt_disable();	/* to guarantee atomic r-m-w of futex op */
 #endif
@@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h
index db7319e9b506..efb79fafff1d 100644
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@@ -94,6 +94,8 @@ static inline int is_pae40_enabled(void)
 	return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
 }
 
+extern int pae40_exist_but_not_enab(void);
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif
diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index 233d5ffe6ec7..a325e6a36523 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -16,11 +16,6 @@
 #define arch_spin_is_locked(x)	((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__)
 #define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 #ifdef CONFIG_ARC_HAS_LLSC
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c
index f928795fd07a..067ea362fb3e 100644
--- a/arch/arc/kernel/intc-arcv2.c
+++ b/arch/arc/kernel/intc-arcv2.c
@@ -75,10 +75,20 @@ void arc_init_IRQ(void)
 	 * Set a default priority for all available interrupts to prevent
 	 * switching of register banks if Fast IRQ and multiple register banks
 	 * are supported by CPU.
+	 * Also disable private-per-core IRQ lines so faulty external HW won't
+	 * trigger interrupt that kernel is not ready to handle.
 	 */
 	for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) {
 		write_aux_reg(AUX_IRQ_SELECT, i);
 		write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO);
+
+		/*
+		 * Only mask cpu private IRQs here.
+		 * "common" interrupts are masked at IDU, otherwise it would
+		 * need to be unmasked at each cpu, with IPIs
+		 */
+		if (i < FIRST_EXT_IRQ)
+			write_aux_reg(AUX_IRQ_ENABLE, 0);
 	}
 
 	/* setup status32, don't enable intr yet as kernel doesn't want */
diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c
index 7e608c6b0a01..47b421fa0147 100644
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@@ -27,7 +27,7 @@
  */
 void arc_init_IRQ(void)
 {
-	int level_mask = 0;
+	unsigned int level_mask = 0, i;
 
        /* Is timer high priority Interrupt (Level2 in ARCompact jargon) */
 	level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ;
@@ -40,6 +40,18 @@ void arc_init_IRQ(void)
 
 	if (level_mask)
 		pr_info("Level-2 interrupts bitset %x\n", level_mask);
+
+	/*
+	 * Disable all IRQ lines so faulty external hardware won't
+	 * trigger interrupt that kernel is not ready to handle.
+	 */
+	for (i = TIMER0_IRQ; i < NR_CPU_IRQS; i++) {
+		unsigned int ienb;
+
+		ienb = read_aux_reg(AUX_IENABLE);
+		ienb &= ~(1 << i);
+		write_aux_reg(AUX_IENABLE, ienb);
+	}
 }
 
 /*
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index a867575a758b..7db283b46ebd 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -665,6 +665,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
 	unsigned int ctrl;
+	phys_addr_t end;
 
 	spin_lock_irqsave(&lock, flags);
 
@@ -694,8 +695,19 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	 * END needs to be setup before START (latter triggers the operation)
 	 * END can't be same as START, so add (l2_line_sz - 1) to sz
 	 */
-	write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1));
-	write_aux_reg(ARC_REG_SLC_RGN_START, paddr);
+	end = paddr + sz + l2_line_sz - 1;
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end));
+
+	write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end));
+
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr));
+
+	write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr));
+
+	/* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */
+	read_aux_reg(ARC_REG_SLC_CTRL);
 
 	while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
 
@@ -1111,6 +1123,13 @@ noinline void __init arc_ioc_setup(void)
 	__dc_enable();
 }
 
+/*
+ * Cache related boot time checks/setups only needed on master CPU:
+ *  - Geometry checks (kernel build and hardware agree: e.g. L1_CACHE_BYTES)
+ *    Assume SMP only, so all cores will have same cache config. A check on
+ *    one core suffices for all
+ *  - IOC setup / dma callbacks only need to be done once
+ */
 void __init arc_cache_init_master(void)
 {
 	unsigned int __maybe_unused cpu = smp_processor_id();
@@ -1190,12 +1209,27 @@ void __ref arc_cache_init(void)
 
 	printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
 
-	/*
-	 * Only master CPU needs to execute rest of function:
-	 *  - Assume SMP so all cores will have same cache config so
-	 *    any geomtry checks will be same for all
-	 *  - IOC setup / dma callbacks only need to be setup once
-	 */
 	if (!cpu)
 		arc_cache_init_master();
+
+	/*
+	 * In PAE regime, TLB and cache maintenance ops take wider addresses
+	 * And even if PAE is not enabled in kernel, the upper 32-bits still need
+	 * to be zeroed to keep the ops sane.
+	 * As an optimization for more common !PAE enabled case, zero them out
+	 * once at init, rather than checking/setting to 0 for every runtime op
+	 */
+	if (is_isa_arcv2() && pae40_exist_but_not_enab()) {
+
+		if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE))
+			write_aux_reg(ARC_REG_IC_PTAG_HI, 0);
+
+		if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE))
+			write_aux_reg(ARC_REG_DC_PTAG_HI, 0);
+
+		if (l2_line_sz) {
+			write_aux_reg(ARC_REG_SLC_RGN_END1, 0);
+			write_aux_reg(ARC_REG_SLC_RGN_START1, 0);
+		}
+	}
 }
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 2a07e6ecafbd..e9d93604ad0f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -117,7 +117,7 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
 	if (off < count && user_count <= (count - off)) {
@@ -153,6 +153,19 @@ static void _dma_cache_sync(phys_addr_t paddr, size_t size,
 	}
 }
 
+/*
+ * arc_dma_map_page - map a portion of a page for streaming DMA
+ *
+ * Ensure that any data held in the cache is appropriately discarded
+ * or written back.
+ *
+ * The device owns this memory once this call has completed.  The CPU
+ * can regain ownership by calling dma_unmap_page().
+ *
+ * Note: while it takes struct page as arg, caller can "abuse" it to pass
+ * a region larger than PAGE_SIZE, provided it is physically contiguous
+ * and this still works correctly
+ */
 static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs)
@@ -165,6 +178,24 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 	return plat_phys_to_dma(dev, paddr);
 }
 
+/*
+ * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
+ *
+ * After this call, reads by the CPU to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ *
+ * Note: historically this routine was not implemented for ARC
+ */
+static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	phys_addr_t paddr = plat_dma_to_phys(dev, handle);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+}
+
 static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	   int nents, enum dma_data_direction dir, unsigned long attrs)
 {
@@ -178,6 +209,18 @@ static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	return nents;
 }
 
+static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+			     int nents, enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i)
+		arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir,
+				   attrs);
+}
+
 static void arc_dma_sync_single_for_cpu(struct device *dev,
 		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -223,7 +266,9 @@ const struct dma_map_ops arc_dma_ops = {
 	.free			= arc_dma_free,
 	.mmap			= arc_dma_mmap,
 	.map_page		= arc_dma_map_page,
+	.unmap_page		= arc_dma_unmap_page,
 	.map_sg			= arc_dma_map_sg,
+	.unmap_sg		= arc_dma_unmap_sg,
 	.sync_single_for_device	= arc_dma_sync_single_for_device,
 	.sync_single_for_cpu	= arc_dma_sync_single_for_cpu,
 	.sync_sg_for_cpu	= arc_dma_sync_sg_for_cpu,
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index d0126fdfe2d8..b181f3ee38aa 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -104,6 +104,8 @@
 /* A copy of the ASID from the PID reg is kept in asid_cache */
 DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
 
+static int __read_mostly pae_exists;
+
 /*
  * Utility Routine to erase a J-TLB entry
  * Caller needs to setup Index Reg (manually or via getIndex)
@@ -784,7 +786,7 @@ void read_decode_mmu_bcr(void)
 		mmu->u_dtlb = mmu4->u_dtlb * 4;
 		mmu->u_itlb = mmu4->u_itlb * 4;
 		mmu->sasid = mmu4->sasid;
-		mmu->pae = mmu4->pae;
+		pae_exists = mmu->pae = mmu4->pae;
 	}
 }
 
@@ -809,6 +811,11 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 	return buf;
 }
 
+int pae40_exist_but_not_enab(void)
+{
+	return pae_exists && !is_pae40_enabled();
+}
+
 void arc_mmu_init(void)
 {
 	char str[256];
@@ -859,6 +866,9 @@ void arc_mmu_init(void)
 	/* swapper_pg_dir is the pgd for the kernel, used by vmalloc */
 	write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir);
 #endif
+
+	if (pae40_exist_but_not_enab())
+		write_aux_reg(ARC_REG_TLBPD1HI, 0);
 }
 
 /*
diff --git a/arch/arc/plat-sim/Kconfig b/arch/arc/plat-sim/Kconfig
deleted file mode 100644
index ac6af96a82f3..000000000000
--- a/arch/arc/plat-sim/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Copyright (C) 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-#
-
-menuconfig ARC_PLAT_SIM
-	bool "ARC nSIM based simulation virtual platforms"
-	help
-	  Support for nSIM based ARC simulation platforms
-	  This includes the standalone nSIM (uart only) vs. System C OSCI VP
diff --git a/arch/arc/plat-sim/platform.c b/arch/arc/plat-sim/platform.c
index aea87389e44b..5cda56b1a2ea 100644
--- a/arch/arc/plat-sim/platform.c
+++ b/arch/arc/plat-sim/platform.c
@@ -20,11 +20,14 @@
  */
 
 static const char *simulation_compat[] __initconst = {
+#ifdef CONFIG_ISA_ARCOMPACT
 	"snps,nsim",
-	"snps,nsim_hs",
 	"snps,nsimosci",
+#else
+	"snps,nsim_hs",
 	"snps,nsimosci_hs",
 	"snps,zebu_hs",
+#endif
 	NULL,
 };
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a208bfe367b5..61a0cb15067e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -380,7 +380,7 @@ config ARCH_EP93XX
 	bool "EP93xx-based"
 	select ARCH_HAS_HOLES_MEMORYMODEL
 	select ARM_AMBA
-	select ARM_PATCH_PHYS_VIRT
+	imply ARM_PATCH_PHYS_VIRT
 	select ARM_VIC
 	select AUTO_ZRELADDR
 	select CLKDEV_LOOKUP
diff --git a/arch/arm/boot/dts/armada-388-gp.dts b/arch/arm/boot/dts/armada-388-gp.dts
index 895fa6cfa15a..563901e0ec07 100644
--- a/arch/arm/boot/dts/armada-388-gp.dts
+++ b/arch/arm/boot/dts/armada-388-gp.dts
@@ -75,7 +75,7 @@
 					pinctrl-names = "default";
 					pinctrl-0 = <&pca0_pins>;
 					interrupt-parent = <&gpio0>;
-					interrupts = <18 IRQ_TYPE_EDGE_FALLING>;
+					interrupts = <18 IRQ_TYPE_LEVEL_LOW>;
 					gpio-controller;
 					#gpio-cells = <2>;
 					interrupt-controller;
@@ -87,7 +87,7 @@
 					compatible = "nxp,pca9555";
 					pinctrl-names = "default";
 					interrupt-parent = <&gpio0>;
-					interrupts = <18 IRQ_TYPE_EDGE_FALLING>;
+					interrupts = <18 IRQ_TYPE_LEVEL_LOW>;
 					gpio-controller;
 					#gpio-cells = <2>;
 					interrupt-controller;
diff --git a/arch/arm/boot/dts/da850-evm.dts b/arch/arm/boot/dts/da850-evm.dts
index a423e8ebfb37..67e72bc72e80 100644
--- a/arch/arm/boot/dts/da850-evm.dts
+++ b/arch/arm/boot/dts/da850-evm.dts
@@ -301,25 +301,4 @@
 	pinctrl-names = "default";
 	pinctrl-0 = <&vpif_capture_pins>, <&vpif_display_pins>;
 	status = "okay";
-
-	/* VPIF capture port */
-	port@0 {
-		vpif_input_ch0: endpoint@0 {
-			reg = <0>;
-			bus-width = <8>;
-		};
-
-		vpif_input_ch1: endpoint@1 {
-			reg = <1>;
-			bus-width = <8>;
-			data-shift = <8>;
-		};
-	};
-
-	/* VPIF display port */
-	port@1 {
-		vpif_output_ch0: endpoint {
-			bus-width = <8>;
-		};
-	};
 };
diff --git a/arch/arm/boot/dts/da850-lcdk.dts b/arch/arm/boot/dts/da850-lcdk.dts
index b837fec70eec..a0f0916156e6 100644
--- a/arch/arm/boot/dts/da850-lcdk.dts
+++ b/arch/arm/boot/dts/da850-lcdk.dts
@@ -318,11 +318,4 @@
 	pinctrl-names = "default";
 	pinctrl-0 = <&vpif_capture_pins>;
 	status = "okay";
-
-	/* VPIF capture port */
-	port {
-		vpif_ch0: endpoint {
-			  bus-width = <8>;
-		};
-	};
 };
diff --git a/arch/arm/boot/dts/dm8168-evm.dts b/arch/arm/boot/dts/dm8168-evm.dts
index 1865976db5f9..c72a2132aa82 100644
--- a/arch/arm/boot/dts/dm8168-evm.dts
+++ b/arch/arm/boot/dts/dm8168-evm.dts
@@ -68,6 +68,34 @@
 			DM816X_IOPAD(0x0d08, MUX_MODE0)			/* USB1_DRVVBUS */
 		>;
 	};
+
+	nandflash_pins: nandflash_pins {
+		pinctrl-single,pins = <
+			DM816X_IOPAD(0x0b38, PULL_UP | MUX_MODE0)		/* PINCTRL207 GPMC_CS0*/
+			DM816X_IOPAD(0x0b60, PULL_ENA | MUX_MODE0)		/* PINCTRL217 GPMC_ADV_ALE */
+			DM816X_IOPAD(0x0b54, PULL_UP | PULL_ENA | MUX_MODE0)	/* PINCTRL214 GPMC_OE_RE */
+			DM816X_IOPAD(0x0b58, PULL_ENA | MUX_MODE0)		/* PINCTRL215 GPMC_BE0_CLE */
+			DM816X_IOPAD(0x0b50, PULL_UP | MUX_MODE0)		/* PINCTRL213 GPMC_WE */
+			DM816X_IOPAD(0x0b6c, MUX_MODE0)				/* PINCTRL220 GPMC_WAIT */
+			DM816X_IOPAD(0x0be4, PULL_ENA | MUX_MODE0)		/* PINCTRL250 GPMC_CLK */
+			DM816X_IOPAD(0x0ba4, MUX_MODE0)				/* PINCTRL234 GPMC_D0 */
+			DM816X_IOPAD(0x0ba8, MUX_MODE0)				/* PINCTRL234 GPMC_D1 */
+			DM816X_IOPAD(0x0bac, MUX_MODE0)				/* PINCTRL234 GPMC_D2 */
+			DM816X_IOPAD(0x0bb0, MUX_MODE0)				/* PINCTRL234 GPMC_D3 */
+			DM816X_IOPAD(0x0bb4, MUX_MODE0)				/* PINCTRL234 GPMC_D4 */
+			DM816X_IOPAD(0x0bb8, MUX_MODE0)				/* PINCTRL234 GPMC_D5 */
+			DM816X_IOPAD(0x0bbc, MUX_MODE0)				/* PINCTRL234 GPMC_D6 */
+			DM816X_IOPAD(0x0bc0, MUX_MODE0)				/* PINCTRL234 GPMC_D7 */
+			DM816X_IOPAD(0x0bc4, MUX_MODE0)				/* PINCTRL234 GPMC_D8 */
+			DM816X_IOPAD(0x0bc8, MUX_MODE0)				/* PINCTRL234 GPMC_D9 */
+			DM816X_IOPAD(0x0bcc, MUX_MODE0)				/* PINCTRL234 GPMC_D10 */
+			DM816X_IOPAD(0x0bd0, MUX_MODE0)				/* PINCTRL234 GPMC_D11 */
+			DM816X_IOPAD(0x0bd4, MUX_MODE0)				/* PINCTRL234 GPMC_D12 */
+			DM816X_IOPAD(0x0bd8, MUX_MODE0)				/* PINCTRL234 GPMC_D13 */
+			DM816X_IOPAD(0x0bdc, MUX_MODE0)				/* PINCTRL234 GPMC_D14 */
+			DM816X_IOPAD(0x0be0, MUX_MODE0)				/* PINCTRL234 GPMC_D15 */
+		>;
+	};
 };
 
 &i2c1 {
@@ -90,6 +118,8 @@
 
 &gpmc {
 	ranges = <0 0 0x04000000 0x01000000>;	/* CS0: 16MB for NAND */
+	pinctrl-names = "default";
+	pinctrl-0 = <&nandflash_pins>;
 
 	nand@0,0 {
 		compatible = "ti,omap2-nand";
@@ -98,9 +128,11 @@
 		interrupt-parent = <&gpmc>;
 		interrupts = <0 IRQ_TYPE_NONE>, /* fifoevent */
 			     <1 IRQ_TYPE_NONE>; /* termcount */
+		rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */
 		#address-cells = <1>;
 		#size-cells = <1>;
 		ti,nand-ecc-opt = "bch8";
+		ti,elm-id = <&elm>;
 		nand-bus-width = <16>;
 		gpmc,device-width = <2>;
 		gpmc,sync-clk-ps = <0>;
@@ -164,7 +196,7 @@
 	vmmc-supply = <&vmmcsd_fixed>;
 	bus-width = <4>;
 	cd-gpios = <&gpio2 7 GPIO_ACTIVE_LOW>;
-	wp-gpios = <&gpio2 8 GPIO_ACTIVE_LOW>;
+	wp-gpios = <&gpio2 8 GPIO_ACTIVE_HIGH>;
 };
 
 /* At least dm8168-evm rev c won't support multipoint, later may */
diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi
index 59cbf958fcc3..566b2a8c8b96 100644
--- a/arch/arm/boot/dts/dm816x.dtsi
+++ b/arch/arm/boot/dts/dm816x.dtsi
@@ -145,7 +145,7 @@
 		};
 
 		elm: elm@48080000 {
-			compatible = "ti,816-elm";
+			compatible = "ti,am3352-elm";
 			ti,hwmods = "elm";
 			reg = <0x48080000 0x2000>;
 			interrupts = <4>;
diff --git a/arch/arm/boot/dts/dra71-evm.dts b/arch/arm/boot/dts/dra71-evm.dts
index 4d57a55473af..a6298eb56978 100644
--- a/arch/arm/boot/dts/dra71-evm.dts
+++ b/arch/arm/boot/dts/dra71-evm.dts
@@ -190,7 +190,7 @@
 		ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
 		ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
 		ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
-		ti,impedance-control = <0x1f>;
+		ti,min-output-impedance;
 	};
 
 	dp83867_1: ethernet-phy@3 {
@@ -198,7 +198,7 @@
 		ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
 		ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
 		ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
-		ti,impedance-control = <0x1f>;
+		ti,min-output-impedance;
 	};
 };
 
diff --git a/arch/arm/boot/dts/exynos4.dtsi b/arch/arm/boot/dts/exynos4.dtsi
index 497a9470c888..5739389f5bb8 100644
--- a/arch/arm/boot/dts/exynos4.dtsi
+++ b/arch/arm/boot/dts/exynos4.dtsi
@@ -59,6 +59,9 @@
 		compatible = "samsung,exynos4210-audss-clock";
 		reg = <0x03810000 0x0C>;
 		#clock-cells = <1>;
+		clocks = <&clock CLK_FIN_PLL>, <&clock CLK_FOUT_EPLL>,
+			 <&clock CLK_SCLK_AUDIO0>, <&clock CLK_SCLK_AUDIO0>;
+		clock-names = "pll_ref", "pll_in", "sclk_audio", "sclk_pcm_in";
 	};
 
 	i2s0: i2s@03830000 {
diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
index f92f95741207..a183b56283f8 100644
--- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
+++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
@@ -266,6 +266,7 @@
 
 &hdmicec {
 	status = "okay";
+	needs-hpd;
 };
 
 &hsi2c_4 {
diff --git a/arch/arm/boot/dts/imx25.dtsi b/arch/arm/boot/dts/imx25.dtsi
index dfcc8e00cf1c..0ade3619f3c3 100644
--- a/arch/arm/boot/dts/imx25.dtsi
+++ b/arch/arm/boot/dts/imx25.dtsi
@@ -297,6 +297,7 @@
 				#address-cells = <1>;
 				#size-cells = <1>;
 				status = "disabled";
+				ranges;
 
 				adc: adc@50030800 {
 					compatible = "fsl,imx25-gcq";
diff --git a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
index aeaa5a6e4fcf..a24e4f1911ab 100644
--- a/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
+++ b/arch/arm/boot/dts/imx6qdl-nitrogen6_som2.dtsi
@@ -507,7 +507,7 @@
 	pinctrl_pcie: pciegrp {
 		fsl,pins = <
 			/* PCIe reset */
-			MX6QDL_PAD_EIM_BCLK__GPIO6_IO31	0x030b0
+			MX6QDL_PAD_EIM_DA0__GPIO3_IO00	0x030b0
 			MX6QDL_PAD_EIM_DA4__GPIO3_IO04	0x030b0
 		>;
 	};
@@ -668,7 +668,7 @@
 &pcie {
 	pinctrl-names = "default";
 	pinctrl-0 = <&pinctrl_pcie>;
-	reset-gpio = <&gpio6 31 GPIO_ACTIVE_LOW>;
+	reset-gpio = <&gpio3 0 GPIO_ACTIVE_LOW>;
 	status = "okay";
 };
 
diff --git a/arch/arm/boot/dts/imx7d-sdb.dts b/arch/arm/boot/dts/imx7d-sdb.dts
index 54c45402286b..0a24d1bf3c39 100644
--- a/arch/arm/boot/dts/imx7d-sdb.dts
+++ b/arch/arm/boot/dts/imx7d-sdb.dts
@@ -557,6 +557,14 @@
 			>;
 		};
 
+		pinctrl_spi4: spi4grp {
+			fsl,pins = <
+				MX7D_PAD_GPIO1_IO09__GPIO1_IO9	0x59
+				MX7D_PAD_GPIO1_IO12__GPIO1_IO12	0x59
+				MX7D_PAD_GPIO1_IO13__GPIO1_IO13	0x59
+			>;
+		};
+
 		pinctrl_tsc2046_pendown: tsc2046_pendown {
 			fsl,pins = <
 				MX7D_PAD_EPDC_BDR1__GPIO2_IO29		0x59
@@ -697,13 +705,5 @@
 		fsl,pins = <
 			MX7D_PAD_LPSR_GPIO1_IO01__PWM1_OUT		0x110b0
 		>;
-
-		pinctrl_spi4: spi4grp {
-			fsl,pins = <
-				MX7D_PAD_GPIO1_IO09__GPIO1_IO9	0x59
-				MX7D_PAD_GPIO1_IO12__GPIO1_IO12	0x59
-				MX7D_PAD_GPIO1_IO13__GPIO1_IO13	0x59
-			>;
-		};
 	};
 };
diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi
index 7bb9df2c1460..9319e1f0f1d8 100644
--- a/arch/arm/boot/dts/ls1021a.dtsi
+++ b/arch/arm/boot/dts/ls1021a.dtsi
@@ -129,14 +129,14 @@
 		};
 
 		msi1: msi-controller@1570e00 {
-			compatible = "fsl,1s1021a-msi";
+			compatible = "fsl,ls1021a-msi";
 			reg = <0x0 0x1570e00 0x0 0x8>;
 			msi-controller;
 			interrupts =  <GIC_SPI 179 IRQ_TYPE_LEVEL_HIGH>;
 		};
 
 		msi2: msi-controller@1570e08 {
-			compatible = "fsl,1s1021a-msi";
+			compatible = "fsl,ls1021a-msi";
 			reg = <0x0 0x1570e08 0x0 0x8>;
 			msi-controller;
 			interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_HIGH>;
@@ -699,7 +699,7 @@
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
 				  0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
-			msi-parent = <&msi1>;
+			msi-parent = <&msi1>, <&msi2>;
 			#interrupt-cells = <1>;
 			interrupt-map-mask = <0 0 0 7>;
 			interrupt-map = <0000 0 0 1 &gic GIC_SPI 91  IRQ_TYPE_LEVEL_HIGH>,
@@ -722,7 +722,7 @@
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000   /* downstream I/O */
 				  0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
-			msi-parent = <&msi2>;
+			msi-parent = <&msi1>, <&msi2>;
 			#interrupt-cells = <1>;
 			interrupt-map-mask = <0 0 0 7>;
 			interrupt-map = <0000 0 0 1 &gic GIC_SPI 92  IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 2484f11761ea..858e1fed762a 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -1126,8 +1126,8 @@
 		};
 	};
 
-	gpu: mali@ffa30000 {
-		compatible = "rockchip,rk3288-mali", "arm,mali-t760", "arm,mali-midgard";
+	gpu: gpu@ffa30000 {
+		compatible = "rockchip,rk3288-mali", "arm,mali-t760";
 		reg = <0xffa30000 0x10000>;
 		interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
 			     <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi
index cc06da394366..60e69aeacbdb 100644
--- a/arch/arm/boot/dts/sama5d2.dtsi
+++ b/arch/arm/boot/dts/sama5d2.dtsi
@@ -303,7 +303,7 @@
 			#size-cells = <1>;
 			atmel,smc = <&hsmc>;
 			reg = <0x10000000 0x10000000
-			       0x40000000 0x30000000>;
+			       0x60000000 0x30000000>;
 			ranges = <0x0 0x0 0x10000000 0x10000000
 				  0x1 0x0 0x60000000 0x10000000
 				  0x2 0x0 0x70000000 0x10000000
@@ -1048,18 +1048,18 @@
 			};
 
 			hsmc: hsmc@f8014000 {
-				compatible = "atmel,sama5d3-smc", "syscon", "simple-mfd";
+				compatible = "atmel,sama5d2-smc", "syscon", "simple-mfd";
 				reg = <0xf8014000 0x1000>;
-				interrupts = <5 IRQ_TYPE_LEVEL_HIGH 6>;
+				interrupts = <17 IRQ_TYPE_LEVEL_HIGH 6>;
 				clocks = <&hsmc_clk>;
 				#address-cells = <1>;
 				#size-cells = <1>;
 				ranges;
 
-				pmecc: ecc-engine@ffffc070 {
+				pmecc: ecc-engine@f8014070 {
 					compatible = "atmel,sama5d2-pmecc";
-					reg = <0xffffc070 0x490>,
-					      <0xffffc500 0x100>;
+					reg = <0xf8014070 0x490>,
+					      <0xf8014500 0x100>;
 				};
 			};
 
diff --git a/arch/arm/boot/dts/sun8i-a83t.dtsi b/arch/arm/boot/dts/sun8i-a83t.dtsi
index 8923ba625b76..19a8f4fcfab5 100644
--- a/arch/arm/boot/dts/sun8i-a83t.dtsi
+++ b/arch/arm/boot/dts/sun8i-a83t.dtsi
@@ -44,7 +44,9 @@
 
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 
+#include <dt-bindings/clock/sun8i-a83t-ccu.h>
 #include <dt-bindings/clock/sun8i-r-ccu.h>
+#include <dt-bindings/reset/sun8i-a83t-ccu.h>
 
 / {
 	interrupt-parent = <&gic>;
@@ -175,8 +177,8 @@
 			compatible = "allwinner,sun8i-a83t-dma";
 			reg = <0x01c02000 0x1000>;
 			interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
-			clocks = <&ccu 21>;
-			resets = <&ccu 7>;
+			clocks = <&ccu CLK_BUS_DMA>;
+			resets = <&ccu RST_BUS_DMA>;
 			#dma-cells = <1>;
 		};
 
@@ -195,7 +197,7 @@
 				     <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>;
 			reg = <0x01c20800 0x400>;
-			clocks = <&ccu 45>, <&osc24M>, <&osc16Md512>;
+			clocks = <&ccu CLK_BUS_PIO>, <&osc24M>, <&osc16Md512>;
 			clock-names = "apb", "hosc", "losc";
 			gpio-controller;
 			interrupt-controller;
@@ -247,8 +249,8 @@
 				     "allwinner,sun8i-h3-spdif";
 			reg = <0x01c21000 0x400>;
 			interrupts = <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>;
-			clocks = <&ccu 44>, <&ccu 76>;
-			resets = <&ccu 32>;
+			clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>;
+			resets = <&ccu RST_BUS_SPDIF>;
 			clock-names = "apb", "spdif";
 			dmas = <&dma 2>;
 			dma-names = "tx";
@@ -263,8 +265,8 @@
 			interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>;
 			reg-shift = <2>;
 			reg-io-width = <4>;
-			clocks = <&ccu 53>;
-			resets = <&ccu 40>;
+			clocks = <&ccu CLK_BUS_UART0>;
+			resets = <&ccu RST_BUS_UART0>;
 			status = "disabled";
 		};
 
diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
index 6713d0f2b3f4..b1502df7b509 100644
--- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
+++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
@@ -56,8 +56,6 @@
 
 	aliases {
 		serial0 = &uart0;
-		/* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */
-		ethernet0 = &emac;
 		ethernet1 = &xr819;
 	};
 
@@ -104,13 +102,6 @@
 	status = "okay";
 };
 
-&emac {
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
index d756ff825116..a337af1de322 100644
--- a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts
@@ -52,7 +52,6 @@
 	compatible = "sinovoip,bpi-m2-plus", "allwinner,sun8i-h3";
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 		serial1 = &uart1;
 	};
@@ -115,30 +114,12 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
 &ir {
 	pinctrl-names = "default";
 	pinctrl-0 = <&ir_pins_a>;
 	status = "okay";
 };
 
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <0>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
index 78f6c24952dd..8d2cc6e9a03f 100644
--- a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
+++ b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts
@@ -46,10 +46,3 @@
 	model = "FriendlyARM NanoPi NEO";
 	compatible = "friendlyarm,nanopi-neo", "allwinner,sun8i-h3";
 };
-
-&emac {
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-	status = "okay";
-};
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
index 17cdeae19c6f..8ff71b1bb45b 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts
@@ -54,7 +54,6 @@
 	aliases {
 		serial0 = &uart0;
 		/* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */
-		ethernet0 = &emac;
 		ethernet1 = &rtl8189;
 	};
 
@@ -118,13 +117,6 @@
 	status = "okay";
 };
 
-&emac {
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
 &ir {
 	pinctrl-names = "default";
 	pinctrl-0 = <&ir_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
index 6880268e8b87..5fea430e0eb1 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts
@@ -52,7 +52,6 @@
 	compatible = "xunlong,orangepi-one", "allwinner,sun8i-h3";
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 	};
 
@@ -98,13 +97,6 @@
 	status = "okay";
 };
 
-&emac {
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
index a10281b455f5..8b93f5c781a7 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts
@@ -53,11 +53,6 @@
 	};
 };
 
-&emac {
-	/* LEDs changed to active high on the plus */
-	/delete-property/ allwinner,leds-active-low;
-};
-
 &mmc1 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc1_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
index 998b60f8d295..1a044b17d6c6 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts
@@ -52,7 +52,6 @@
 	compatible = "xunlong,orangepi-pc", "allwinner,sun8i-h3";
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 	};
 
@@ -114,13 +113,6 @@
 	status = "okay";
 };
 
-&emac {
-	phy-handle = <&int_mii_phy>;
-	phy-mode = "mii";
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
 &ir {
 	pinctrl-names = "default";
 	pinctrl-0 = <&ir_pins_a>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
index 331ed683ac62..828ae7a526d9 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts
@@ -47,10 +47,6 @@
 	model = "Xunlong Orange Pi Plus / Plus 2";
 	compatible = "xunlong,orangepi-plus", "allwinner,sun8i-h3";
 
-	aliases {
-		ethernet0 = &emac;
-	};
-
 	reg_gmac_3v3: gmac-3v3 {
 		compatible = "regulator-fixed";
 		regulator-name = "gmac-3v3";
@@ -78,24 +74,6 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-
-	allwinner,leds-active-low;
-	status = "okay";
-};
-
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <0>;
-	};
-};
-
 &mmc2 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc2_8bit_pins>;
diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
index 80026f3caafc..97920b12a944 100644
--- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
+++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts
@@ -61,19 +61,3 @@
 		gpio = <&pio 3 6 GPIO_ACTIVE_HIGH>; /* PD6 */
 	};
 };
-
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-	status = "okay";
-};
-
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
index 6f2162608006..11240a8313c2 100644
--- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi
+++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
@@ -391,32 +391,6 @@
 			clocks = <&osc24M>;
 		};
 
-		emac: ethernet@1c30000 {
-			compatible = "allwinner,sun8i-h3-emac";
-			syscon = <&syscon>;
-			reg = <0x01c30000 0x104>;
-			interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
-			interrupt-names = "macirq";
-			resets = <&ccu RST_BUS_EMAC>;
-			reset-names = "stmmaceth";
-			clocks = <&ccu CLK_BUS_EMAC>;
-			clock-names = "stmmaceth";
-			#address-cells = <1>;
-			#size-cells = <0>;
-			status = "disabled";
-
-			mdio: mdio {
-				#address-cells = <1>;
-				#size-cells = <0>;
-				int_mii_phy: ethernet-phy@1 {
-					compatible = "ethernet-phy-ieee802.3-c22";
-					reg = <1>;
-					clocks = <&ccu CLK_BUS_EPHY>;
-					resets = <&ccu RST_BUS_EPHY>;
-				};
-			};
-		};
-
 		spi0: spi@01c68000 {
 			compatible = "allwinner,sun8i-h3-spi";
 			reg = <0x01c68000 0x1000>;
diff --git a/arch/arm/boot/dts/tango4-vantage-1172.dts b/arch/arm/boot/dts/tango4-vantage-1172.dts
index 86d8df98802f..13bcc460bcb2 100644
--- a/arch/arm/boot/dts/tango4-vantage-1172.dts
+++ b/arch/arm/boot/dts/tango4-vantage-1172.dts
@@ -22,7 +22,7 @@
 };
 
 &eth0 {
-	phy-connection-type = "rgmii";
+	phy-connection-type = "rgmii-id";
 	phy-handle = <&eth0_phy>;
 	#address-cells = <1>;
 	#size-cells = <0>;
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h
index 27475904e096..eee269321923 100644
--- a/arch/arm/include/asm/arch_gicv3.h
+++ b/arch/arm/include/asm/arch_gicv3.h
@@ -276,6 +276,12 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr)
 #define gicr_write_pendbaser(v, c)	__gic_writeq_nonatomic(v, c)
 
 /*
+ * GICR_xLPIR - only the lower bits are significant
+ */
+#define gic_read_lpir(c)		readl_relaxed(c)
+#define gic_write_lpir(v, c)		writel_relaxed(lower_32_bits(v), c)
+
+/*
  * GITS_TYPER is an ID register and doesn't need atomicity.
  */
 #define gits_read_typer(c)		__gic_readq_nonatomic(c)
@@ -291,5 +297,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr)
  */
 #define gits_write_cwriter(v, c)	__gic_writeq_nonatomic(v, c)
 
+/*
+ * GITS_VPROPBASER - hi and lo bits may be accessed independently.
+ */
+#define gits_write_vpropbaser(v, c)	__gic_writeq_nonatomic(v, c)
+
+/*
+ * GITS_VPENDBASER - the Valid bit must be cleared before changing
+ * anything else.
+ */
+static inline void gits_write_vpendbaser(u64 val, void * __iomem addr)
+{
+	u32 tmp;
+
+	tmp = readl_relaxed(addr + 4);
+	if (tmp & (GICR_VPENDBASER_Valid >> 32)) {
+		tmp &= ~(GICR_VPENDBASER_Valid >> 32);
+		writel_relaxed(tmp, addr + 4);
+	}
+
+	/*
+	 * Use the fact that __gic_writeq_nonatomic writes the second
+	 * half of the 64bit quantity after the first.
+	 */
+	__gic_writeq_nonatomic(val, addr);
+}
+
+#define gits_read_vpendbaser(c)		__gic_readq_nonatomic(c)
+
 #endif /* !__ASSEMBLY__ */
 #endif /* !__ASM_ARCH_GICV3_H */
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h
index 4e6e88a6b2f4..2244a94ed9c9 100644
--- a/arch/arm/include/asm/bug.h
+++ b/arch/arm/include/asm/bug.h
@@ -37,7 +37,7 @@ do {								\
 		".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
 		"2:\t.asciz " #__file "\n" 			\
 		".popsection\n" 				\
-		".pushsection __bug_table,\"a\"\n"		\
+		".pushsection __bug_table,\"aw\"\n"		\
 		".align 2\n"					\
 		"3:\t.word 1b, 2b\n"				\
 		"\t.hword " #__line ", 0\n"			\
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index d69bebf697e7..74504b154256 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -116,7 +116,7 @@ struct cpu_cache_fns {
 	void (*dma_unmap_area)(const void *, size_t, int);
 
 	void (*dma_flush_range)(const void *, const void *);
-};
+} __no_randomize_layout;
 
 /*
  * Select the calling method
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 6795368ad023..cc414382dab4 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 #endif /* !SMP */
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 #ifndef CONFIG_SMP
 	preempt_disable();
 #endif
@@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h
index 1869af6bac5c..25021b798a1e 100644
--- a/arch/arm/include/asm/kexec.h
+++ b/arch/arm/include/asm/kexec.h
@@ -19,6 +19,11 @@
 
 #ifndef __ASSEMBLY__
 
+#define ARCH_HAS_KIMAGE_ARCH
+struct kimage_arch {
+	u32 kernel_r2;
+};
+
 /**
  * crash_setup_regs() - save registers for the panic kernel
  * @newregs: registers are saved here
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 127e2dd2e21c..4a879f6ff13b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -225,12 +225,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
-/* We do not have shadow page tables, hence the empty hooks */
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-							 unsigned long address)
-{
-}
-
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 4bec45442072..c030143c18c6 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -52,22 +52,6 @@ static inline void dsb_sev(void)
  * memory.
  */
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u16 owner = READ_ONCE(lock->tickets.owner);
-
-	for (;;) {
-		arch_spinlock_t tmp = READ_ONCE(*lock);
-
-		if (tmp.tickets.owner == tmp.tickets.next ||
-		    tmp.tickets.owner != owner)
-			break;
-
-		wfe();
-	}
-	smp_acquire__after_ctrl_dep();
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define TIF_NEED_RESCHED	1	/* rescheduling necessary */
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_UPROBE		3	/* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE	4	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	5	/* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
-#define TIF_SECCOMP		7	/* seccomp syscall filtering active */
+#define TIF_FSCHECK		4	/* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE	5	/* syscall trace active */
+#define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT	7	/* syscall tracepoint instrumentation */
+#define TIF_SECCOMP		8	/* seccomp syscall filtering active */
 
 #define TIF_NOHZ		12	/* in adaptive nohz mode */
 #define TIF_USING_IWMMXT	17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING |	\
+				 _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
+				 _TIF_FSCHECK)
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 3f2eb76243e3..d5562f9ce600 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -148,7 +148,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->fullmm = !(start | (end+1));
@@ -166,8 +167,14 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
 }
 
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->range_start = start;
+		tlb->range_end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index f555bb3664dc..683d9230984a 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -18,7 +18,6 @@ struct undef_hook {
 void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static inline int __in_irqentry_text(unsigned long ptr)
 {
 	extern char __irqentry_text_start[];
@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
 	return ptr >= (unsigned long)&__irqentry_text_start &&
 	       ptr < (unsigned long)&__irqentry_text_end;
 }
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
-	return 0;
-}
-#endif
 
 static inline int in_exception_text(unsigned long ptr)
 {
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 0bf2347495f1..87936dd5d151 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 	modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
 }
 
 #define segment_eq(a, b)	((a) == (b))
diff --git a/arch/arm/include/asm/ucontext.h b/arch/arm/include/asm/ucontext.h
index 14749aec94bf..921d8274855c 100644
--- a/arch/arm/include/asm/ucontext.h
+++ b/arch/arm/include/asm/ucontext.h
@@ -35,6 +35,12 @@ struct ucontext {
  * bytes, to prevent unpredictable padding in the signal frame.
  */
 
+/*
+ * Dummy padding block: if this magic is encountered, the block should
+ * be skipped using the corresponding size field.
+ */
+#define DUMMY_MAGIC		0xb0d9ed01
+
 #ifdef CONFIG_CRUNCH
 #define CRUNCH_MAGIC		0x5065cf03
 #define CRUNCH_STORAGE_SIZE	(CRUNCH_SIZE + 8)
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -41,7 +41,9 @@ ret_fast_syscall:
  UNWIND(.cantunwind	)
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	bne	fast_work_pending
 
 	/* perform architecture specific actions before user return */
@@ -67,12 +69,15 @@ ret_fast_syscall:
 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	beq	no_work_pending
  UNWIND(.fnend		)
 ENDPROC(ret_fast_syscall)
 
 	/* Slower path - fall through to work_pending */
+fast_work_pending:
 #endif
 
 	tst	r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 15495887ca14..fe1419eeb932 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -30,7 +30,6 @@ extern unsigned long kexec_boot_atags;
 
 static atomic_t waiting_for_crash_ipi;
 
-static unsigned long dt_mem;
 /*
  * Provide a dummy crash_notes definition while crash dump arrives to arm.
  * This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
@@ -42,6 +41,9 @@ int machine_kexec_prepare(struct kimage *image)
 	__be32 header;
 	int i, err;
 
+	image->arch.kernel_r2 = image->start - KEXEC_ARM_ZIMAGE_OFFSET
+				     + KEXEC_ARM_ATAGS_OFFSET;
+
 	/*
 	 * Validate that if the current HW supports SMP, then the SW supports
 	 * and implements CPU hotplug for the current HW. If not, we won't be
@@ -66,8 +68,8 @@ int machine_kexec_prepare(struct kimage *image)
 		if (err)
 			return err;
 
-		if (be32_to_cpu(header) == OF_DT_HEADER)
-			dt_mem = current_segment->mem;
+		if (header == cpu_to_be32(OF_DT_HEADER))
+			image->arch.kernel_r2 = current_segment->mem;
 	}
 	return 0;
 }
@@ -165,8 +167,7 @@ void machine_kexec(struct kimage *image)
 	kexec_start_address = image->start;
 	kexec_indirection_page = page_list;
 	kexec_mach_type = machine_arch_type;
-	kexec_boot_atags = dt_mem ?: image->start - KEXEC_ARM_ZIMAGE_OFFSET
-				     + KEXEC_ARM_ATAGS_OFFSET;
+	kexec_boot_atags = image->arch.kernel_r2;
 
 	/* copy our kernel relocation code to the control code page */
 	reboot_entry = fncpy(reboot_code_buffer,
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 4e80bf7420d4..8e9a3e40d949 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -987,6 +987,9 @@ static void __init reserve_crashkernel(void)
 
 	if (crash_base <= 0) {
 		unsigned long long crash_max = idmap_to_phys((u32)~0);
+		unsigned long long lowmem_max = __pa(high_memory - 1) + 1;
+		if (crash_max > lowmem_max)
+			crash_max = lowmem_max;
 		crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max,
 						    crash_size, CRASH_ALIGN);
 		if (!crash_base) {
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f2141427b..e2de50bf8742 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 
 #include <asm/elf.h>
 #include <asm/cacheflush.h>
@@ -40,8 +41,10 @@ static int preserve_crunch_context(struct crunch_sigframe __user *frame)
 	return __copy_to_user(frame, kframe, sizeof(*frame));
 }
 
-static int restore_crunch_context(struct crunch_sigframe __user *frame)
+static int restore_crunch_context(char __user **auxp)
 {
+	struct crunch_sigframe __user *frame =
+		(struct crunch_sigframe __user *)*auxp;
 	char kbuf[sizeof(*frame) + 8];
 	struct crunch_sigframe *kframe;
 
@@ -52,6 +55,7 @@ static int restore_crunch_context(struct crunch_sigframe __user *frame)
 	if (kframe->magic != CRUNCH_MAGIC ||
 	    kframe->size != CRUNCH_STORAGE_SIZE)
 		return -1;
+	*auxp += CRUNCH_STORAGE_SIZE;
 	crunch_task_restore(current_thread_info(), &kframe->storage);
 	return 0;
 }
@@ -59,21 +63,39 @@ static int restore_crunch_context(struct crunch_sigframe __user *frame)
 
 #ifdef CONFIG_IWMMXT
 
-static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
+static int preserve_iwmmxt_context(struct iwmmxt_sigframe __user *frame)
 {
 	char kbuf[sizeof(*frame) + 8];
 	struct iwmmxt_sigframe *kframe;
+	int err = 0;
 
 	/* the iWMMXt context must be 64 bit aligned */
 	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
-	kframe->magic = IWMMXT_MAGIC;
-	kframe->size = IWMMXT_STORAGE_SIZE;
-	iwmmxt_task_copy(current_thread_info(), &kframe->storage);
-	return __copy_to_user(frame, kframe, sizeof(*frame));
+
+	if (test_thread_flag(TIF_USING_IWMMXT)) {
+		kframe->magic = IWMMXT_MAGIC;
+		kframe->size = IWMMXT_STORAGE_SIZE;
+		iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+
+		err = __copy_to_user(frame, kframe, sizeof(*frame));
+	} else {
+		/*
+		 * For bug-compatibility with older kernels, some space
+		 * has to be reserved for iWMMXt even if it's not used.
+		 * Set the magic and size appropriately so that properly
+		 * written userspace can skip it reliably:
+		 */
+		__put_user_error(DUMMY_MAGIC, &frame->magic, err);
+		__put_user_error(IWMMXT_STORAGE_SIZE, &frame->size, err);
+	}
+
+	return err;
 }
 
-static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
+static int restore_iwmmxt_context(char __user **auxp)
 {
+	struct iwmmxt_sigframe __user *frame =
+		(struct iwmmxt_sigframe __user *)*auxp;
 	char kbuf[sizeof(*frame) + 8];
 	struct iwmmxt_sigframe *kframe;
 
@@ -81,10 +103,28 @@ static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
 	if (__copy_from_user(kframe, frame, sizeof(*frame)))
 		return -1;
-	if (kframe->magic != IWMMXT_MAGIC ||
-	    kframe->size != IWMMXT_STORAGE_SIZE)
+
+	/*
+	 * For non-iWMMXt threads: a single iwmmxt_sigframe-sized dummy
+	 * block is discarded for compatibility with setup_sigframe() if
+	 * present, but we don't mandate its presence.  If some other
+	 * magic is here, it's not for us:
+	 */
+	if (!test_thread_flag(TIF_USING_IWMMXT) &&
+	    kframe->magic != DUMMY_MAGIC)
+		return 0;
+
+	if (kframe->size != IWMMXT_STORAGE_SIZE)
 		return -1;
-	iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+
+	if (test_thread_flag(TIF_USING_IWMMXT)) {
+		if (kframe->magic != IWMMXT_MAGIC)
+			return -1;
+
+		iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+	}
+
+	*auxp += IWMMXT_STORAGE_SIZE;
 	return 0;
 }
 
@@ -107,8 +147,10 @@ static int preserve_vfp_context(struct vfp_sigframe __user *frame)
 	return vfp_preserve_user_clear_hwstate(&frame->ufp, &frame->ufp_exc);
 }
 
-static int restore_vfp_context(struct vfp_sigframe __user *frame)
+static int restore_vfp_context(char __user **auxp)
 {
+	struct vfp_sigframe __user *frame =
+		(struct vfp_sigframe __user *)*auxp;
 	unsigned long magic;
 	unsigned long size;
 	int err = 0;
@@ -121,6 +163,7 @@ static int restore_vfp_context(struct vfp_sigframe __user *frame)
 	if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE)
 		return -EINVAL;
 
+	*auxp += size;
 	return vfp_restore_user_hwstate(&frame->ufp, &frame->ufp_exc);
 }
 
@@ -141,7 +184,7 @@ struct rt_sigframe {
 
 static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
 {
-	struct aux_sigframe __user *aux;
+	char __user *aux;
 	sigset_t set;
 	int err;
 
@@ -169,18 +212,18 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
 
 	err |= !valid_user_regs(regs);
 
-	aux = (struct aux_sigframe __user *) sf->uc.uc_regspace;
+	aux = (char __user *) sf->uc.uc_regspace;
 #ifdef CONFIG_CRUNCH
 	if (err == 0)
-		err |= restore_crunch_context(&aux->crunch);
+		err |= restore_crunch_context(&aux);
 #endif
 #ifdef CONFIG_IWMMXT
-	if (err == 0 && test_thread_flag(TIF_USING_IWMMXT))
-		err |= restore_iwmmxt_context(&aux->iwmmxt);
+	if (err == 0)
+		err |= restore_iwmmxt_context(&aux);
 #endif
 #ifdef CONFIG_VFP
 	if (err == 0)
-		err |= restore_vfp_context(&aux->vfp);
+		err |= restore_vfp_context(&aux);
 #endif
 
 	return err;
@@ -286,7 +329,7 @@ setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs, sigset_t *set)
 		err |= preserve_crunch_context(&aux->crunch);
 #endif
 #ifdef CONFIG_IWMMXT
-	if (err == 0 && test_thread_flag(TIF_USING_IWMMXT))
+	if (err == 0)
 		err |= preserve_iwmmxt_context(&aux->iwmmxt);
 #endif
 #ifdef CONFIG_VFP
@@ -571,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
 			schedule();
diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
index d735e5fc4772..195da38cb9a2 100644
--- a/arch/arm/mach-at91/Kconfig
+++ b/arch/arm/mach-at91/Kconfig
@@ -1,7 +1,7 @@
 menuconfig ARCH_AT91
 	bool "Atmel SoCs"
 	depends on ARCH_MULTI_V4T || ARCH_MULTI_V5 || ARCH_MULTI_V7 || ARM_SINGLE_ARMV7M
-	select ARM_CPU_SUSPEND if PM
+	select ARM_CPU_SUSPEND if PM && ARCH_MULTI_V7
 	select COMMON_CLK_AT91
 	select GPIOLIB
 	select PINCTRL
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 667fddac3856..5036f996e694 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -608,6 +608,9 @@ static void __init at91_pm_init(void (*pm_idle)(void))
 
 void __init at91rm9200_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_AT91RM9200))
+		return;
+
 	at91_dt_ramc();
 
 	/*
@@ -620,18 +623,27 @@ void __init at91rm9200_pm_init(void)
 
 void __init at91sam9_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_AT91SAM9))
+		return;
+
 	at91_dt_ramc();
 	at91_pm_init(at91sam9_idle);
 }
 
 void __init sama5_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_SAMA5))
+		return;
+
 	at91_dt_ramc();
 	at91_pm_init(NULL);
 }
 
 void __init sama5d2_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_SAMA5D2))
+		return;
+
 	at91_pm_backup_init();
 	sama5_pm_init();
 }
diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c
index b5625d009288..e568c8c6f69c 100644
--- a/arch/arm/mach-davinci/board-da850-evm.c
+++ b/arch/arm/mach-davinci/board-da850-evm.c
@@ -1166,7 +1166,7 @@ static struct tvp514x_platform_data tvp5146_pdata = {
 
 #define TVP514X_STD_ALL (V4L2_STD_NTSC | V4L2_STD_PAL)
 
-static const struct vpif_input da850_ch0_inputs[] = {
+static struct vpif_input da850_ch0_inputs[] = {
 	{
 		.input = {
 			.index = 0,
@@ -1181,7 +1181,7 @@ static const struct vpif_input da850_ch0_inputs[] = {
 	},
 };
 
-static const struct vpif_input da850_ch1_inputs[] = {
+static struct vpif_input da850_ch1_inputs[] = {
 	{
 		.input = {
 			.index = 0,
diff --git a/arch/arm/mach-davinci/clock.c b/arch/arm/mach-davinci/clock.c
index f5dce9b4e617..f77a4f766050 100644
--- a/arch/arm/mach-davinci/clock.c
+++ b/arch/arm/mach-davinci/clock.c
@@ -218,6 +218,15 @@ int clk_set_parent(struct clk *clk, struct clk *parent)
 }
 EXPORT_SYMBOL(clk_set_parent);
 
+struct clk *clk_get_parent(struct clk *clk)
+{
+	if (!clk)
+		return NULL;
+
+	return clk->parent;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
 int clk_register(struct clk *clk)
 {
 	if (clk == NULL || IS_ERR(clk))
diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c
index 39ef3b613912..beec5f16443a 100644
--- a/arch/arm/mach-ep93xx/clock.c
+++ b/arch/arm/mach-ep93xx/clock.c
@@ -475,6 +475,26 @@ int clk_set_rate(struct clk *clk, unsigned long rate)
 }
 EXPORT_SYMBOL(clk_set_rate);
 
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+	WARN_ON(clk);
+	return 0;
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+	WARN_ON(clk);
+	return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+	return clk->parent;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
 
 static char fclk_divisors[] = { 1, 2, 4, 8, 16, 1, 1, 1 };
 static char hclk_divisors[] = { 1, 2, 4, 5, 6, 8, 16, 32 };
diff --git a/arch/arm/mach-hisi/Kconfig b/arch/arm/mach-hisi/Kconfig
index a3b091a4d344..65a048fa08ec 100644
--- a/arch/arm/mach-hisi/Kconfig
+++ b/arch/arm/mach-hisi/Kconfig
@@ -39,6 +39,7 @@ config ARCH_HIP04
 	select HAVE_ARM_ARCH_TIMER
 	select MCPM if SMP
 	select MCPM_QUAD_CLUSTER if SMP
+	select GENERIC_IRQ_EFFECTIVE_AFF_MASK
 	help
 	  Support for Hisilicon HiP04 SoC family
 
diff --git a/arch/arm/mach-ixp4xx/include/mach/io.h b/arch/arm/mach-ixp4xx/include/mach/io.h
index 7a0c13bf4269..844e8ac593e2 100644
--- a/arch/arm/mach-ixp4xx/include/mach/io.h
+++ b/arch/arm/mach-ixp4xx/include/mach/io.h
@@ -95,8 +95,10 @@ static inline void __indirect_writeb(u8 value, volatile void __iomem *p)
 }
 
 static inline void __indirect_writesb(volatile void __iomem *bus_addr,
-				      const u8 *vaddr, int count)
+				      const void *p, int count)
 {
+	const u8 *vaddr = p;
+
 	while (count--)
 		writeb(*vaddr++, bus_addr);
 }
@@ -118,8 +120,10 @@ static inline void __indirect_writew(u16 value, volatile void __iomem *p)
 }
 
 static inline void __indirect_writesw(volatile void __iomem *bus_addr,
-				      const u16 *vaddr, int count)
+				      const void *p, int count)
 {
+	const u16 *vaddr = p;
+
 	while (count--)
 		writew(*vaddr++, bus_addr);
 }
@@ -137,8 +141,9 @@ static inline void __indirect_writel(u32 value, volatile void __iomem *p)
 }
 
 static inline void __indirect_writesl(volatile void __iomem *bus_addr,
-				      const u32 *vaddr, int count)
+				      const void *p, int count)
 {
+	const u32 *vaddr = p;
 	while (count--)
 		writel(*vaddr++, bus_addr);
 }
@@ -160,8 +165,10 @@ static inline u8 __indirect_readb(const volatile void __iomem *p)
 }
 
 static inline void __indirect_readsb(const volatile void __iomem *bus_addr,
-				     u8 *vaddr, u32 count)
+				     void *p, u32 count)
 {
+	u8 *vaddr = p;
+
 	while (count--)
 		*vaddr++ = readb(bus_addr);
 }
@@ -183,8 +190,10 @@ static inline u16 __indirect_readw(const volatile void __iomem *p)
 }
 
 static inline void __indirect_readsw(const volatile void __iomem *bus_addr,
-				     u16 *vaddr, u32 count)
+				     void *p, u32 count)
 {
+	u16 *vaddr = p;
+
 	while (count--)
 		*vaddr++ = readw(bus_addr);
 }
@@ -204,8 +213,10 @@ static inline u32 __indirect_readl(const volatile void __iomem *p)
 }
 
 static inline void __indirect_readsl(const volatile void __iomem *bus_addr,
-				     u32 *vaddr, u32 count)
+				     void *p, u32 count)
 {
+	u32 *vaddr = p;
+
 	while (count--)
 		*vaddr++ = readl(bus_addr);
 }
@@ -523,8 +534,15 @@ static inline void iowrite32_rep(void __iomem *addr, const void *vaddr,
 #endif
 }
 
-#define	ioport_map(port, nr)		((void __iomem*)(port + PIO_OFFSET))
-#define	ioport_unmap(addr)
+#define ioport_map(port, nr) ioport_map(port, nr)
+static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
+{
+	return ((void __iomem*)((port) + PIO_OFFSET));
+}
+#define	ioport_unmap(addr) ioport_unmap(addr)
+static inline void ioport_unmap(void __iomem *addr)
+{
+}
 #endif /* CONFIG_PCI */
 
 #endif /* __ASM_ARM_ARCH_IO_H */
diff --git a/arch/arm/mach-mmp/devices.c b/arch/arm/mach-mmp/devices.c
index 3330ac7cfbef..671c7a09ab3d 100644
--- a/arch/arm/mach-mmp/devices.c
+++ b/arch/arm/mach-mmp/devices.c
@@ -238,7 +238,7 @@ void pxa_usb_phy_deinit(void __iomem *phy_reg)
 #endif
 
 #if IS_ENABLED(CONFIG_USB_SUPPORT)
-static u64 usb_dma_mask = ~(u32)0;
+static u64 __maybe_unused usb_dma_mask = ~(u32)0;
 
 #if IS_ENABLED(CONFIG_USB_MV_UDC)
 struct resource pxa168_u2o_resources[] = {
diff --git a/arch/arm/mach-mvebu/platsmp.c b/arch/arm/mach-mvebu/platsmp.c
index e62273aacb43..4ffbbd217e82 100644
--- a/arch/arm/mach-mvebu/platsmp.c
+++ b/arch/arm/mach-mvebu/platsmp.c
@@ -211,7 +211,7 @@ static int mv98dx3236_resume_set_cpu_boot_addr(int hw_cpu, void *boot_addr)
 		return PTR_ERR(base);
 
 	writel(0, base + MV98DX3236_CPU_RESUME_CTRL_REG);
-	writel(virt_to_phys(boot_addr), base + MV98DX3236_CPU_RESUME_ADDR_REG);
+	writel(__pa_symbol(boot_addr), base + MV98DX3236_CPU_RESUME_ADDR_REG);
 
 	iounmap(base);
 
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index 6613a6ff5dbc..6cbc69c92913 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -510,6 +510,7 @@ static void __init ams_delta_init(void)
 static void modem_pm(struct uart_port *port, unsigned int state, unsigned old)
 {
 	struct modem_private_data *priv = port->private_data;
+	int ret;
 
 	if (IS_ERR(priv->regulator))
 		return;
@@ -518,9 +519,16 @@ static void modem_pm(struct uart_port *port, unsigned int state, unsigned old)
 		return;
 
 	if (state == 0)
-		regulator_enable(priv->regulator);
+		ret = regulator_enable(priv->regulator);
 	else if (old == 0)
-		regulator_disable(priv->regulator);
+		ret = regulator_disable(priv->regulator);
+	else
+		ret = 0;
+
+	if (ret)
+		dev_warn(port->dev,
+			 "ams_delta modem_pm: failed to %sable regulator: %d\n",
+			 state ? "dis" : "en", ret);
 }
 
 static struct plat_serial8250_port ams_delta_modem_ports[] = {
diff --git a/arch/arm/mach-omap1/board-osk.c b/arch/arm/mach-omap1/board-osk.c
index 4dfb99504810..95ac1929aede 100644
--- a/arch/arm/mach-omap1/board-osk.c
+++ b/arch/arm/mach-omap1/board-osk.c
@@ -441,13 +441,11 @@ static struct spi_board_info __initdata mistral_boardinfo[] = { {
 	.chip_select		= 0,
 } };
 
-#ifdef	CONFIG_PM
 static irqreturn_t
 osk_mistral_wake_interrupt(int irq, void *ignored)
 {
 	return IRQ_HANDLED;
 }
-#endif
 
 static void __init osk_mistral_init(void)
 {
@@ -515,7 +513,6 @@ static void __init osk_mistral_init(void)
 
 		gpio_direction_input(OMAP_MPUIO(2));
 		irq_set_irq_type(irq, IRQ_TYPE_EDGE_RISING);
-#ifdef	CONFIG_PM
 		/* share the IRQ in case someone wants to use the
 		 * button for more than wakeup from system sleep.
 		 */
@@ -529,7 +526,6 @@ static void __init osk_mistral_init(void)
 				ret);
 		} else
 			enable_irq_wake(irq);
-#endif
 	} else
 		printk(KERN_ERR "OSK+Mistral: wakeup button is awol\n");
 
diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile
index 779fb1f680b3..b3b3b3a19183 100644
--- a/arch/arm/mach-omap2/Makefile
+++ b/arch/arm/mach-omap2/Makefile
@@ -8,7 +8,7 @@ ccflags-y := -I$(srctree)/$(src)/include \
 # Common support
 obj-y := id.o io.o control.o devices.o fb.o timer.o pm.o \
 	 common.o dma.o wd_timer.o display.o i2c.o hdq1w.o omap_hwmod.o \
-	 omap_device.o omap-headsmp.o sram.o drm.o
+	 omap_device.o omap-headsmp.o sram.o
 
 hwmod-common				= omap_hwmod.o omap_hwmod_reset.o \
 					  omap_hwmod_common_data.o
diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c
index dc9e34e670a2..583fc39d84cd 100644
--- a/arch/arm/mach-omap2/board-generic.c
+++ b/arch/arm/mach-omap2/board-generic.c
@@ -28,11 +28,12 @@ static const struct of_device_id omap_dt_match_table[] __initconst = {
 	{ }
 };
 
-static void __init omap_generic_init(void)
+static void __init __maybe_unused omap_generic_init(void)
 {
 	pdata_quirks_init(omap_dt_match_table);
 
 	omapdss_init_of();
+	omap_soc_device_init();
 }
 
 #ifdef CONFIG_SOC_OMAP2420
diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c
index 8fa01c0ecdb2..b3f6eb5d04a2 100644
--- a/arch/arm/mach-omap2/display.c
+++ b/arch/arm/mach-omap2/display.c
@@ -66,6 +66,7 @@
  */
 #define FRAMEDONE_IRQ_TIMEOUT		100
 
+#if defined(CONFIG_FB_OMAP2)
 static struct platform_device omap_display_device = {
 	.name          = "omapdss",
 	.id            = -1,
@@ -163,6 +164,65 @@ static enum omapdss_version __init omap_display_get_version(void)
 		return OMAPDSS_VER_UNKNOWN;
 }
 
+static int __init omapdss_init_fbdev(void)
+{
+	static struct omap_dss_board_info board_data = {
+		.dsi_enable_pads = omap_dsi_enable_pads,
+		.dsi_disable_pads = omap_dsi_disable_pads,
+		.set_min_bus_tput = omap_dss_set_min_bus_tput,
+	};
+	struct device_node *node;
+	int r;
+
+	board_data.version = omap_display_get_version();
+	if (board_data.version == OMAPDSS_VER_UNKNOWN) {
+		pr_err("DSS not supported on this SoC\n");
+		return -ENODEV;
+	}
+
+	omap_display_device.dev.platform_data = &board_data;
+
+	r = platform_device_register(&omap_display_device);
+	if (r < 0) {
+		pr_err("Unable to register omapdss device\n");
+		return r;
+	}
+
+	/* create vrfb device */
+	r = omap_init_vrfb();
+	if (r < 0) {
+		pr_err("Unable to register omapvrfb device\n");
+		return r;
+	}
+
+	/* create FB device */
+	r = omap_init_fb();
+	if (r < 0) {
+		pr_err("Unable to register omapfb device\n");
+		return r;
+	}
+
+	/* create V4L2 display device */
+	r = omap_init_vout();
+	if (r < 0) {
+		pr_err("Unable to register omap_vout device\n");
+		return r;
+	}
+
+	/* add DSI info for omap4 */
+	node = of_find_node_by_name(NULL, "omap4_padconf_global");
+	if (node)
+		omap4_dsi_mux_syscon = syscon_node_to_regmap(node);
+
+	return 0;
+}
+#else
+static inline int omapdss_init_fbdev(void)
+{
+	return 0;
+}
+#endif /* CONFIG_FB_OMAP2 */
+
 static void dispc_disable_outputs(void)
 {
 	u32 v, irq_mask = 0;
@@ -335,16 +395,9 @@ static struct device_node * __init omapdss_find_dss_of_node(void)
 int __init omapdss_init_of(void)
 {
 	int r;
-	enum omapdss_version ver;
 	struct device_node *node;
 	struct platform_device *pdev;
 
-	static struct omap_dss_board_info board_data = {
-		.dsi_enable_pads = omap_dsi_enable_pads,
-		.dsi_disable_pads = omap_dsi_disable_pads,
-		.set_min_bus_tput = omap_dss_set_min_bus_tput,
-	};
-
 	/* only create dss helper devices if dss is enabled in the .dts */
 
 	node = omapdss_find_dss_of_node();
@@ -354,13 +407,6 @@ int __init omapdss_init_of(void)
 	if (!of_device_is_available(node))
 		return 0;
 
-	ver = omap_display_get_version();
-
-	if (ver == OMAPDSS_VER_UNKNOWN) {
-		pr_err("DSS not supported on this SoC\n");
-		return -ENODEV;
-	}
-
 	pdev = of_find_device_by_node(node);
 
 	if (!pdev) {
@@ -374,48 +420,5 @@ int __init omapdss_init_of(void)
 		return r;
 	}
 
-	board_data.version = ver;
-
-	omap_display_device.dev.platform_data = &board_data;
-
-	r = platform_device_register(&omap_display_device);
-	if (r < 0) {
-		pr_err("Unable to register omapdss device\n");
-		return r;
-	}
-
-	/* create DRM device */
-	r = omap_init_drm();
-	if (r < 0) {
-		pr_err("Unable to register omapdrm device\n");
-		return r;
-	}
-
-	/* create vrfb device */
-	r = omap_init_vrfb();
-	if (r < 0) {
-		pr_err("Unable to register omapvrfb device\n");
-		return r;
-	}
-
-	/* create FB device */
-	r = omap_init_fb();
-	if (r < 0) {
-		pr_err("Unable to register omapfb device\n");
-		return r;
-	}
-
-	/* create V4L2 display device */
-	r = omap_init_vout();
-	if (r < 0) {
-		pr_err("Unable to register omap_vout device\n");
-		return r;
-	}
-
-	/* add DSI info for omap4 */
-	node = of_find_node_by_name(NULL, "omap4_padconf_global");
-	if (node)
-		omap4_dsi_mux_syscon = syscon_node_to_regmap(node);
-
-	return 0;
+	return omapdss_init_fbdev();
 }
diff --git a/arch/arm/mach-omap2/display.h b/arch/arm/mach-omap2/display.h
index 9a39646d4316..42ec2e99a2f4 100644
--- a/arch/arm/mach-omap2/display.h
+++ b/arch/arm/mach-omap2/display.h
@@ -26,7 +26,6 @@ struct omap_dss_dispc_dev_attr {
 	bool	has_framedonetv_irq;
 };
 
-int omap_init_drm(void);
 int omap_init_vrfb(void);
 int omap_init_fb(void);
 int omap_init_vout(void);
diff --git a/arch/arm/mach-omap2/drm.c b/arch/arm/mach-omap2/drm.c
deleted file mode 100644
index 44fef961bb70..000000000000
--- a/arch/arm/mach-omap2/drm.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * DRM/KMS device registration for TI OMAP platforms
- *
- * Copyright (C) 2012 Texas Instruments
- * Author: Rob Clark <rob.clark@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/dma-mapping.h>
-#include <linux/platform_data/omap_drm.h>
-
-#include "soc.h"
-#include "display.h"
-
-#if IS_ENABLED(CONFIG_DRM_OMAP)
-
-static struct omap_drm_platform_data platform_data;
-
-static struct platform_device omap_drm_device = {
-	.dev = {
-		.coherent_dma_mask = DMA_BIT_MASK(32),
-		.platform_data = &platform_data,
-	},
-	.name = "omapdrm",
-	.id = 0,
-};
-
-int __init omap_init_drm(void)
-{
-	platform_data.omaprev = GET_OMAP_TYPE;
-
-	return platform_device_register(&omap_drm_device);
-
-}
-#else
-int __init omap_init_drm(void) { return 0; }
-#endif
diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c
index be517b048762..5b614388d72f 100644
--- a/arch/arm/mach-omap2/hsmmc.c
+++ b/arch/arm/mach-omap2/hsmmc.c
@@ -32,120 +32,6 @@ static u16 control_devconf1_offset;
 
 #define HSMMC_NAME_LEN	9
 
-static void omap_hsmmc1_before_set_reg(struct device *dev,
-				       int power_on, int vdd)
-{
-	u32 reg, prog_io;
-	struct omap_hsmmc_platform_data *mmc = dev->platform_data;
-
-	if (mmc->remux)
-		mmc->remux(dev, power_on);
-
-	/*
-	 * Assume we power both OMAP VMMC1 (for CMD, CLK, DAT0..3) and the
-	 * card with Vcc regulator (from twl4030 or whatever).  OMAP has both
-	 * 1.8V and 3.0V modes, controlled by the PBIAS register.
-	 *
-	 * In 8-bit modes, OMAP VMMC1A (for DAT4..7) needs a supply, which
-	 * is most naturally TWL VSIM; those pins also use PBIAS.
-	 *
-	 * FIXME handle VMMC1A as needed ...
-	 */
-	if (power_on) {
-		if (cpu_is_omap2430()) {
-			reg = omap_ctrl_readl(OMAP243X_CONTROL_DEVCONF1);
-			if ((1 << vdd) >= MMC_VDD_30_31)
-				reg |= OMAP243X_MMC1_ACTIVE_OVERWRITE;
-			else
-				reg &= ~OMAP243X_MMC1_ACTIVE_OVERWRITE;
-			omap_ctrl_writel(reg, OMAP243X_CONTROL_DEVCONF1);
-		}
-
-		if (mmc->internal_clock) {
-			reg = omap_ctrl_readl(OMAP2_CONTROL_DEVCONF0);
-			reg |= OMAP2_MMCSDIO1ADPCLKISEL;
-			omap_ctrl_writel(reg, OMAP2_CONTROL_DEVCONF0);
-		}
-
-		reg = omap_ctrl_readl(control_pbias_offset);
-		if (cpu_is_omap3630()) {
-			/* Set MMC I/O to 52MHz */
-			prog_io = omap_ctrl_readl(OMAP343X_CONTROL_PROG_IO1);
-			prog_io |= OMAP3630_PRG_SDMMC1_SPEEDCTRL;
-			omap_ctrl_writel(prog_io, OMAP343X_CONTROL_PROG_IO1);
-		} else {
-			reg |= OMAP2_PBIASSPEEDCTRL0;
-		}
-		reg &= ~OMAP2_PBIASLITEPWRDNZ0;
-		omap_ctrl_writel(reg, control_pbias_offset);
-	} else {
-		reg = omap_ctrl_readl(control_pbias_offset);
-		reg &= ~OMAP2_PBIASLITEPWRDNZ0;
-		omap_ctrl_writel(reg, control_pbias_offset);
-	}
-}
-
-static void omap_hsmmc1_after_set_reg(struct device *dev, int power_on, int vdd)
-{
-	u32 reg;
-
-	/* 100ms delay required for PBIAS configuration */
-	msleep(100);
-
-	if (power_on) {
-		reg = omap_ctrl_readl(control_pbias_offset);
-		reg |= (OMAP2_PBIASLITEPWRDNZ0 | OMAP2_PBIASSPEEDCTRL0);
-		if ((1 << vdd) <= MMC_VDD_165_195)
-			reg &= ~OMAP2_PBIASLITEVMODE0;
-		else
-			reg |= OMAP2_PBIASLITEVMODE0;
-		omap_ctrl_writel(reg, control_pbias_offset);
-	} else {
-		reg = omap_ctrl_readl(control_pbias_offset);
-		reg |= (OMAP2_PBIASSPEEDCTRL0 | OMAP2_PBIASLITEPWRDNZ0 |
-			OMAP2_PBIASLITEVMODE0);
-		omap_ctrl_writel(reg, control_pbias_offset);
-	}
-}
-
-static void hsmmc2_select_input_clk_src(struct omap_hsmmc_platform_data *mmc)
-{
-	u32 reg;
-
-	reg = omap_ctrl_readl(control_devconf1_offset);
-	if (mmc->internal_clock)
-		reg |= OMAP2_MMCSDIO2ADPCLKISEL;
-	else
-		reg &= ~OMAP2_MMCSDIO2ADPCLKISEL;
-	omap_ctrl_writel(reg, control_devconf1_offset);
-}
-
-static void hsmmc2_before_set_reg(struct device *dev, int power_on, int vdd)
-{
-	struct omap_hsmmc_platform_data *mmc = dev->platform_data;
-
-	if (mmc->remux)
-		mmc->remux(dev, power_on);
-
-	if (power_on)
-		hsmmc2_select_input_clk_src(mmc);
-}
-
-static int am35x_hsmmc2_set_power(struct device *dev, int power_on, int vdd)
-{
-	struct omap_hsmmc_platform_data *mmc = dev->platform_data;
-
-	if (power_on)
-		hsmmc2_select_input_clk_src(mmc);
-
-	return 0;
-}
-
-static int nop_mmc_set_power(struct device *dev, int power_on, int vdd)
-{
-	return 0;
-}
-
 static int __init omap_hsmmc_pdata_init(struct omap2_hsmmc_info *c,
 					struct omap_hsmmc_platform_data *mmc)
 {
@@ -157,101 +43,11 @@ static int __init omap_hsmmc_pdata_init(struct omap2_hsmmc_info *c,
 		return -ENOMEM;
 	}
 
-	if (c->name)
-		strncpy(hc_name, c->name, HSMMC_NAME_LEN);
-	else
-		snprintf(hc_name, (HSMMC_NAME_LEN + 1), "mmc%islot%i",
-								c->mmc, 1);
+	snprintf(hc_name, (HSMMC_NAME_LEN + 1), "mmc%islot%i", c->mmc, 1);
 	mmc->name = hc_name;
 	mmc->caps = c->caps;
-	mmc->internal_clock = !c->ext_clock;
 	mmc->reg_offset = 0;
 
-	if (c->cover_only) {
-		/* detect if mobile phone cover removed */
-		mmc->gpio_cd = -EINVAL;
-		mmc->gpio_cod = c->gpio_cd;
-	} else {
-		/* card detect pin on the mmc socket itself */
-		mmc->gpio_cd = c->gpio_cd;
-		mmc->gpio_cod = -EINVAL;
-	}
-	mmc->gpio_wp = c->gpio_wp;
-
-	mmc->remux = c->remux;
-	mmc->init_card = c->init_card;
-
-	if (c->nonremovable)
-		mmc->nonremovable = 1;
-
-	/*
-	 * NOTE:  MMC slots should have a Vcc regulator set up.
-	 * This may be from a TWL4030-family chip, another
-	 * controllable regulator, or a fixed supply.
-	 *
-	 * temporary HACK: ocr_mask instead of fixed supply
-	 */
-	if (soc_is_am35xx())
-		mmc->ocr_mask = MMC_VDD_165_195 |
-					 MMC_VDD_26_27 |
-					 MMC_VDD_27_28 |
-					 MMC_VDD_29_30 |
-					 MMC_VDD_30_31 |
-					 MMC_VDD_31_32;
-	else
-		mmc->ocr_mask = c->ocr_mask;
-
-	if (!soc_is_am35xx())
-		mmc->features |= HSMMC_HAS_PBIAS;
-
-	switch (c->mmc) {
-	case 1:
-		if (mmc->features & HSMMC_HAS_PBIAS) {
-			/* on-chip level shifting via PBIAS0/PBIAS1 */
-			mmc->before_set_reg =
-					omap_hsmmc1_before_set_reg;
-			mmc->after_set_reg =
-					omap_hsmmc1_after_set_reg;
-		}
-
-		if (soc_is_am35xx())
-			mmc->set_power = nop_mmc_set_power;
-
-		/* OMAP3630 HSMMC1 supports only 4-bit */
-		if (cpu_is_omap3630() &&
-				(c->caps & MMC_CAP_8_BIT_DATA)) {
-			c->caps &= ~MMC_CAP_8_BIT_DATA;
-			c->caps |= MMC_CAP_4_BIT_DATA;
-			mmc->caps = c->caps;
-		}
-		break;
-	case 2:
-		if (soc_is_am35xx())
-			mmc->set_power = am35x_hsmmc2_set_power;
-
-		if (c->ext_clock)
-			c->transceiver = 1;
-		if (c->transceiver && (c->caps & MMC_CAP_8_BIT_DATA)) {
-			c->caps &= ~MMC_CAP_8_BIT_DATA;
-			c->caps |= MMC_CAP_4_BIT_DATA;
-		}
-		if (mmc->features & HSMMC_HAS_PBIAS) {
-			/* off-chip level shifting, or none */
-			mmc->before_set_reg = hsmmc2_before_set_reg;
-			mmc->after_set_reg = NULL;
-		}
-		break;
-	case 3:
-	case 4:
-	case 5:
-		mmc->before_set_reg = NULL;
-		mmc->after_set_reg = NULL;
-		break;
-	default:
-		pr_err("MMC%d configuration not supported!\n", c->mmc);
-		kfree(hc_name);
-		return -ENODEV;
-	}
 	return 0;
 }
 
@@ -260,7 +56,6 @@ static int omap_hsmmc_done;
 void omap_hsmmc_late_init(struct omap2_hsmmc_info *c)
 {
 	struct platform_device *pdev;
-	struct omap_hsmmc_platform_data *mmc_pdata;
 	int res;
 
 	if (omap_hsmmc_done != 1)
@@ -269,32 +64,12 @@ void omap_hsmmc_late_init(struct omap2_hsmmc_info *c)
 	omap_hsmmc_done++;
 
 	for (; c->mmc; c++) {
-		if (!c->deferred)
-			continue;
-
 		pdev = c->pdev;
 		if (!pdev)
 			continue;
-
-		mmc_pdata = pdev->dev.platform_data;
-		if (!mmc_pdata)
-			continue;
-
-		if (c->cover_only) {
-			/* detect if mobile phone cover removed */
-			mmc_pdata->gpio_cd = -EINVAL;
-			mmc_pdata->gpio_cod = c->gpio_cd;
-		} else {
-			/* card detect pin on the mmc socket itself */
-			mmc_pdata->gpio_cd = c->gpio_cd;
-			mmc_pdata->gpio_cod = -EINVAL;
-		}
-		mmc_pdata->gpio_wp = c->gpio_wp;
-
 		res = omap_device_register(pdev);
 		if (res)
-			pr_err("Could not late init MMC %s\n",
-			       c->name);
+			pr_err("Could not late init MMC\n");
 	}
 }
 
@@ -336,13 +111,6 @@ static void __init omap_hsmmc_init_one(struct omap2_hsmmc_info *hsmmcinfo,
 	if (oh->dev_attr != NULL) {
 		mmc_dev_attr = oh->dev_attr;
 		mmc_data->controller_flags = mmc_dev_attr->flags;
-		/*
-		 * erratum 2.1.1.128 doesn't apply if board has
-		 * a transceiver is attached
-		 */
-		if (hsmmcinfo->transceiver)
-			mmc_data->controller_flags &=
-				~OMAP_HSMMC_BROKEN_MULTIBLOCK_READ;
 	}
 
 	pdev = platform_device_alloc(name, ctrl_nr - 1);
@@ -367,9 +135,6 @@ static void __init omap_hsmmc_init_one(struct omap2_hsmmc_info *hsmmcinfo,
 
 	hsmmcinfo->pdev = pdev;
 
-	if (hsmmcinfo->deferred)
-		goto free_mmc;
-
 	res = omap_device_register(pdev);
 	if (res) {
 		pr_err("Could not register od for %s\n", name);
diff --git a/arch/arm/mach-omap2/hsmmc.h b/arch/arm/mach-omap2/hsmmc.h
index 69b619ddc765..af9af5094ec3 100644
--- a/arch/arm/mach-omap2/hsmmc.h
+++ b/arch/arm/mach-omap2/hsmmc.h
@@ -12,18 +12,9 @@ struct omap2_hsmmc_info {
 	u8	mmc;		/* controller 1/2/3 */
 	u32	caps;		/* 4/8 wires and any additional host
 				 * capabilities OR'd (ref. linux/mmc/host.h) */
-	bool	transceiver;	/* MMC-2 option */
-	bool	ext_clock;	/* use external pin for input clock */
-	bool	cover_only;	/* No card detect - just cover switch */
-	bool	nonremovable;	/* Nonremovable e.g. eMMC */
-	bool	deferred;	/* mmc needs a deferred probe */
 	int	gpio_cd;	/* or -EINVAL */
 	int	gpio_wp;	/* or -EINVAL */
-	char	*name;		/* or NULL for default */
 	struct platform_device *pdev;	/* mmc controller instance */
-	int	ocr_mask;	/* temporary HACK */
-	/* Remux (pad configuration) when powering on/off */
-	void (*remux)(struct device *dev, int power_on);
 	/* init some special card */
 	void (*init_card)(struct mmc_card *card);
 };
diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c
index 1d739d1a0a65..cb5d7314cf99 100644
--- a/arch/arm/mach-omap2/io.c
+++ b/arch/arm/mach-omap2/io.c
@@ -410,7 +410,7 @@ static int _set_hwmod_postsetup_state(struct omap_hwmod *oh, void *data)
 	return omap_hwmod_set_postsetup_state(oh, *(u8 *)data);
 }
 
-static void __init omap_hwmod_init_postsetup(void)
+static void __init __maybe_unused omap_hwmod_init_postsetup(void)
 {
 	u8 postsetup_state;
 
@@ -428,7 +428,6 @@ static void __init omap_hwmod_init_postsetup(void)
 static void __init __maybe_unused omap_common_late_init(void)
 {
 	omap2_common_pm_late_init();
-	omap_soc_device_init();
 }
 
 #ifdef CONFIG_SOC_OMAP2420
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index d44e0e2f1106..841ba19d64a6 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -486,7 +486,6 @@ int __init omap3_pm_init(void)
 	ret = request_irq(omap_prcm_event_to_irq("io"),
 		_prcm_int_handle_io, IRQF_SHARED | IRQF_NO_SUSPEND, "pm_io",
 		omap3_pm_init);
-	enable_irq(omap_prcm_event_to_irq("io"));
 
 	if (ret) {
 		pr_err("pm: Failed to request pm_io irq\n");
diff --git a/arch/arm/mach-omap2/prm3xxx.c b/arch/arm/mach-omap2/prm3xxx.c
index 382e236fbfd9..64f6451499a7 100644
--- a/arch/arm/mach-omap2/prm3xxx.c
+++ b/arch/arm/mach-omap2/prm3xxx.c
@@ -692,7 +692,6 @@ static int omap3xxx_prm_late_init(void)
 {
 	struct device_node *np;
 	int irq_num;
-	int ret;
 
 	if (!(prm_features & PRM_HAS_IO_WAKEUP))
 		return 0;
@@ -712,12 +711,8 @@ static int omap3xxx_prm_late_init(void)
 	}
 
 	omap3xxx_prm_enable_io_wakeup();
-	ret = omap_prcm_register_chain_handler(&omap3_prcm_irq_setup);
-	if (!ret)
-		irq_set_status_flags(omap_prcm_event_to_irq("io"),
-				     IRQ_NOAUTOEN);
 
-	return ret;
+	return omap_prcm_register_chain_handler(&omap3_prcm_irq_setup);
 }
 
 static void __exit omap3xxx_prm_exit(void)
diff --git a/arch/arm/mach-omap2/prm44xx.c b/arch/arm/mach-omap2/prm44xx.c
index 87e86a4a9ead..3ab5df1ce900 100644
--- a/arch/arm/mach-omap2/prm44xx.c
+++ b/arch/arm/mach-omap2/prm44xx.c
@@ -337,6 +337,27 @@ static void omap44xx_prm_reconfigure_io_chain(void)
 }
 
 /**
+ * omap44xx_prm_enable_io_wakeup - enable wakeup events from I/O wakeup latches
+ *
+ * Activates the I/O wakeup event latches and allows events logged by
+ * those latches to signal a wakeup event to the PRCM.  For I/O wakeups
+ * to occur, WAKEUPENABLE bits must be set in the pad mux registers, and
+ * omap44xx_prm_reconfigure_io_chain() must be called.  No return value.
+ */
+static void __init omap44xx_prm_enable_io_wakeup(void)
+{
+	s32 inst = omap4_prmst_get_prm_dev_inst();
+
+	if (inst == PRM_INSTANCE_UNKNOWN)
+		return;
+
+	omap4_prm_rmw_inst_reg_bits(OMAP4430_GLOBAL_WUEN_MASK,
+				    OMAP4430_GLOBAL_WUEN_MASK,
+				    inst,
+				    omap4_prcm_irq_setup.pm_ctrl);
+}
+
+/**
  * omap44xx_prm_read_reset_sources - return the last SoC reset source
  *
  * Return a u32 representing the last reset sources of the SoC.  The
@@ -668,6 +689,8 @@ struct pwrdm_ops omap4_pwrdm_operations = {
 	.pwrdm_has_voltdm	= omap4_check_vcvp,
 };
 
+static int omap44xx_prm_late_init(void);
+
 /*
  * XXX document
  */
@@ -675,6 +698,7 @@ static struct prm_ll_data omap44xx_prm_ll_data = {
 	.read_reset_sources = &omap44xx_prm_read_reset_sources,
 	.was_any_context_lost_old = &omap44xx_prm_was_any_context_lost_old,
 	.clear_context_loss_flags_old = &omap44xx_prm_clear_context_loss_flags_old,
+	.late_init = &omap44xx_prm_late_init,
 	.assert_hardreset	= omap4_prminst_assert_hardreset,
 	.deassert_hardreset	= omap4_prminst_deassert_hardreset,
 	.is_hardreset_asserted	= omap4_prminst_is_hardreset_asserted,
@@ -711,6 +735,37 @@ int __init omap44xx_prm_init(const struct omap_prcm_init_data *data)
 	return prm_register(&omap44xx_prm_ll_data);
 }
 
+static int omap44xx_prm_late_init(void)
+{
+	int irq_num;
+
+	if (!(prm_features & PRM_HAS_IO_WAKEUP))
+		return 0;
+
+	irq_num = of_irq_get(prm_init_data->np, 0);
+	/*
+	 * Already have OMAP4 IRQ num. For all other platforms, we need
+	 * IRQ numbers from DT
+	 */
+	if (irq_num < 0 && !(prm_init_data->flags & PRM_IRQ_DEFAULT)) {
+		if (irq_num == -EPROBE_DEFER)
+			return irq_num;
+
+		/* Have nothing to do */
+		return 0;
+	}
+
+	/* Once OMAP4 DT is filled as well */
+	if (irq_num >= 0) {
+		omap4_prcm_irq_setup.irq = irq_num;
+		omap4_prcm_irq_setup.xlate_irq = NULL;
+	}
+
+	omap44xx_prm_enable_io_wakeup();
+
+	return omap_prcm_register_chain_handler(&omap4_prcm_irq_setup);
+}
+
 static void __exit omap44xx_prm_exit(void)
 {
 	prm_unregister(&omap44xx_prm_ll_data);
diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c
index 8cadb302a7d2..ffe05c27087e 100644
--- a/arch/arm/mach-prima2/common.c
+++ b/arch/arm/mach-prima2/common.c
@@ -15,7 +15,7 @@
 #include <linux/of_platform.h>
 #include "common.h"
 
-static void __init sirfsoc_init_late(void)
+static void __init __maybe_unused sirfsoc_init_late(void)
 {
 	sirfsoc_pm_init();
 }
diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig
index 76fbc115ec33..ce7d97babb0f 100644
--- a/arch/arm/mach-pxa/Kconfig
+++ b/arch/arm/mach-pxa/Kconfig
@@ -566,6 +566,7 @@ config MACH_ICONTROL
 config ARCH_PXA_ESERIES
 	bool "PXA based Toshiba e-series PDAs"
 	select FB_W100
+	select FB
 	select PXA25x
 
 config MACH_E330
diff --git a/arch/arm/mach-pxa/include/mach/mtd-xip.h b/arch/arm/mach-pxa/include/mach/mtd-xip.h
index 990d2bf2fb45..9bf4ea6a6f74 100644
--- a/arch/arm/mach-pxa/include/mach/mtd-xip.h
+++ b/arch/arm/mach-pxa/include/mach/mtd-xip.h
@@ -17,11 +17,15 @@
 
 #include <mach/regs-ost.h>
 
-#define xip_irqpending()	(ICIP & ICMR)
+/* restored July 2017, this did not build since 2011! */
+
+#define ICIP			io_p2v(0x40d00000)
+#define ICMR			io_p2v(0x40d00004)
+#define xip_irqpending()	(readl(ICIP) & readl(ICMR))
 
 /* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
-#define xip_currtime()		(OSCR)
-#define xip_elapsed_since(x)	(signed)((OSCR - (x)) / 4)
+#define xip_currtime()		readl(OSCR)
+#define xip_elapsed_since(x)	(signed)((readl(OSCR) - (x)) / 4)
 
 /*
  * xip_cpu_idle() is used when waiting for a delay equal or larger than
diff --git a/arch/arm/mach-rpc/include/mach/hardware.h b/arch/arm/mach-rpc/include/mach/hardware.h
index aa79fa47373a..622d4e5df029 100644
--- a/arch/arm/mach-rpc/include/mach/hardware.h
+++ b/arch/arm/mach-rpc/include/mach/hardware.h
@@ -25,8 +25,8 @@
  *  *_SIZE  is the size of the region
  *  *_BASE  is the virtual address
  */
-#define RAM_SIZE		0x10000000
-#define RAM_START		0x10000000
+#define RPC_RAM_SIZE		0x10000000
+#define RPC_RAM_START		0x10000000
 
 #define EASI_SIZE		0x08000000	/* EASI I/O */
 #define EASI_START		0x08000000
diff --git a/arch/arm/mach-sa1100/clock.c b/arch/arm/mach-sa1100/clock.c
index 0db46895c82a..7d52cd97d96e 100644
--- a/arch/arm/mach-sa1100/clock.c
+++ b/arch/arm/mach-sa1100/clock.c
@@ -35,6 +35,31 @@ struct clk clk_##_name = {				\
 
 static DEFINE_SPINLOCK(clocks_lock);
 
+/* Dummy clk routine to build generic kernel parts that may be using them */
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+	return clk_get_rate(clk);
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_rate(struct clk *clk, unsigned long rate)
+{
+	return 0;
+}
+EXPORT_SYMBOL(clk_set_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+	return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+	return NULL;
+}
+EXPORT_SYMBOL(clk_get_parent);
+
 static void clk_gpio27_enable(struct clk *clk)
 {
 	/*
diff --git a/arch/arm/mach-sa1100/include/mach/mtd-xip.h b/arch/arm/mach-sa1100/include/mach/mtd-xip.h
index b3d684098fbf..cb76096a2e36 100644
--- a/arch/arm/mach-sa1100/include/mach/mtd-xip.h
+++ b/arch/arm/mach-sa1100/include/mach/mtd-xip.h
@@ -20,7 +20,7 @@
 #define xip_irqpending()	(ICIP & ICMR)
 
 /* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
-#define xip_currtime()		(OSCR)
-#define xip_elapsed_since(x)	(signed)((OSCR - (x)) / 4)
+#define xip_currtime()		readl_relaxed(OSCR)
+#define xip_elapsed_since(x)	(signed)((readl_relaxed(OSCR) - (x)) / 4)
 
 #endif /* __ARCH_SA1100_MTD_XIP_H__ */
diff --git a/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c b/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
index 73e3adbc1330..44438f344dc8 100644
--- a/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
+++ b/arch/arm/mach-shmobile/regulator-quirk-rcar-gen2.c
@@ -67,8 +67,12 @@ static int regulator_quirk_notify(struct notifier_block *nb,
 {
 	struct device *dev = data;
 	struct i2c_client *client;
+	static bool done;
 	u32 mon;
 
+	if (done)
+		return 0;
+
 	mon = ioread32(irqc + IRQC_MONITOR);
 	dev_dbg(dev, "%s: %ld, IRQC_MONITOR = 0x%x\n", __func__, action, mon);
 	if (mon & REGULATOR_IRQ_MASK)
@@ -99,7 +103,7 @@ static int regulator_quirk_notify(struct notifier_block *nb,
 remove:
 	dev_info(dev, "IRQ2 is not asserted, removing quirk\n");
 
-	bus_unregister_notifier(&i2c_bus_type, nb);
+	done = true;
 	iounmap(irqc);
 	return 0;
 }
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 28083ef72819..71a34e8c345a 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -133,6 +133,7 @@ static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
 
 static struct arm_pmu_platdata db8500_pmu_platdata = {
 	.handle_irq		= db8500_pmu_handler,
+	.irq_flags		= IRQF_NOBALANCING | IRQF_NO_THREAD,
 };
 
 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
diff --git a/arch/arm/mach-w90x900/clock.c b/arch/arm/mach-w90x900/clock.c
index ac6fd1a2cb59..3f93fac98d97 100644
--- a/arch/arm/mach-w90x900/clock.c
+++ b/arch/arm/mach-w90x900/clock.c
@@ -93,3 +93,32 @@ void nuc900_subclk_enable(struct clk *clk, int enable)
 
 	__raw_writel(clken, W90X900_VA_CLKPWR + SUBCLK);
 }
+
+/* dummy functions, should not be called */
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+	WARN_ON(clk);
+	return 0;
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_rate(struct clk *clk, unsigned long rate)
+{
+	WARN_ON(clk);
+	return 0;
+}
+EXPORT_SYMBOL(clk_set_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+	WARN_ON(clk);
+	return 0;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+	WARN_ON(clk);
+	return NULL;
+}
+EXPORT_SYMBOL(clk_get_parent);
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 90ee354d803e..6db5fc26d154 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -40,9 +40,21 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
 
 {
 	const struct dma_map_ops *ops = &dma_noop_ops;
+	void *ret;
 
 	/*
-	 * We are here because:
+	 * Try generic allocator first if we are advertised that
+	 * consistency is not required.
+	 */
+
+	if (attrs & DMA_ATTR_NON_CONSISTENT)
+		return ops->alloc(dev, size, dma_handle, gfp, attrs);
+
+	ret = dma_alloc_from_global_coherent(size, dma_handle);
+
+	/*
+	 * dma_alloc_from_global_coherent() may fail because:
+	 *
 	 * - no consistent DMA region has been defined, so we can't
 	 *   continue.
 	 * - there is no space left in consistent DMA region, so we
@@ -50,11 +62,8 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
 	 *   advertised that consistency is not required.
 	 */
 
-	if (attrs & DMA_ATTR_NON_CONSISTENT)
-		return ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-	WARN_ON_ONCE(1);
-	return NULL;
+	WARN_ON_ONCE(ret == NULL);
+	return ret;
 }
 
 static void arm_nommu_dma_free(struct device *dev, size_t size,
@@ -63,14 +72,31 @@ static void arm_nommu_dma_free(struct device *dev, size_t size,
 {
 	const struct dma_map_ops *ops = &dma_noop_ops;
 
-	if (attrs & DMA_ATTR_NON_CONSISTENT)
+	if (attrs & DMA_ATTR_NON_CONSISTENT) {
 		ops->free(dev, size, cpu_addr, dma_addr, attrs);
-	else
-		WARN_ON_ONCE(1);
+	} else {
+		int ret = dma_release_from_global_coherent(get_order(size),
+							   cpu_addr);
+
+		WARN_ON_ONCE(ret == 0);
+	}
 
 	return;
 }
 
+static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			      unsigned long attrs)
+{
+	int ret;
+
+	if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret))
+		return ret;
+
+	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+
+
 static void __dma_page_cpu_to_dev(phys_addr_t paddr, size_t size,
 				  enum dma_data_direction dir)
 {
@@ -173,6 +199,7 @@ static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist
 const struct dma_map_ops arm_nommu_dma_ops = {
 	.alloc			= arm_nommu_dma_alloc,
 	.free			= arm_nommu_dma_free,
+	.mmap			= arm_nommu_dma_mmap,
 	.map_page		= arm_nommu_dma_map_page,
 	.unmap_page		= arm_nommu_dma_unmap_page,
 	.map_sg			= arm_nommu_dma_map_sg,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index e7380bafbfa6..fcf1473d6fed 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -851,7 +851,7 @@ static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	unsigned long pfn = dma_to_pfn(dev, dma_addr);
 	unsigned long off = vma->vm_pgoff;
 
-	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
 	if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
index 0d1f026d831a..6872135d7f84 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts
@@ -67,14 +67,6 @@
 	};
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&rgmii_pins>;
-	phy-mode = "rgmii";
-	phy-handle = <&ext_rgmii_phy>;
-	status = "okay";
-};
-
 &i2c1 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&i2c1_pins>;
@@ -85,13 +77,6 @@
 	bias-pull-up;
 };
 
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
index 24f1aac366d6..f82ccf332c0f 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
@@ -48,18 +48,3 @@
 
 	/* TODO: Camera, touchscreen, etc. */
 };
-
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&rgmii_pins>;
-	phy-mode = "rgmii";
-	phy-handle = <&ext_rgmii_phy>;
-	status = "okay";
-};
-
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
index 08cda24ea194..7c533b6d4ba9 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts
@@ -78,15 +78,6 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&rmii_pins>;
-	phy-mode = "rmii";
-	phy-handle = <&ext_rmii_phy1>;
-	status = "okay";
-
-};
-
 &i2c1 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&i2c1_pins>;
@@ -97,13 +88,6 @@
 	bias-pull-up;
 };
 
-&mdio {
-	ext_rmii_phy1: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
index 17eb1cc5bf6b..d891a1a27f6c 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts
@@ -76,21 +76,6 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&rgmii_pins>;
-	phy-mode = "rgmii";
-	phy-handle = <&ext_rgmii_phy>;
-	status = "okay";
-};
-
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
-
 &mmc2 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc2_pins>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
index 9d00622ce845..68aadc9b96dc 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
@@ -449,26 +449,6 @@
 			#size-cells = <0>;
 		};
 
-		emac: ethernet@1c30000 {
-			compatible = "allwinner,sun50i-a64-emac";
-			syscon = <&syscon>;
-			reg = <0x01c30000 0x100>;
-			interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
-			interrupt-names = "macirq";
-			resets = <&ccu RST_BUS_EMAC>;
-			reset-names = "stmmaceth";
-			clocks = <&ccu CLK_BUS_EMAC>;
-			clock-names = "stmmaceth";
-			status = "disabled";
-			#address-cells = <1>;
-			#size-cells = <0>;
-
-			mdio: mdio {
-				#address-cells = <1>;
-				#size-cells = <0>;
-			};
-		};
-
 		gic: interrupt-controller@1c81000 {
 			compatible = "arm,gic-400";
 			reg = <0x01c81000 0x1000>,
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
index 968908761194..1c2387bd5df6 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts
@@ -50,7 +50,6 @@
 	compatible = "friendlyarm,nanopi-neo2", "allwinner,sun50i-h5";
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 	};
 
@@ -109,22 +108,6 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-	status = "okay";
-};
-
-&mdio {
-	ext_rgmii_phy: ethernet-phy@7 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <7>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
index a8296feee884..4f77c8470f6c 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts
@@ -59,7 +59,6 @@
 	};
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 	};
 
@@ -137,28 +136,12 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-	status = "okay";
-};
-
 &ir {
 	pinctrl-names = "default";
 	pinctrl-0 = <&ir_pins_a>;
 	status = "okay";
 };
 
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
index d906b302cbcd..6be06873e5af 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts
@@ -54,7 +54,6 @@
 	compatible = "xunlong,orangepi-prime", "allwinner,sun50i-h5";
 
 	aliases {
-		ethernet0 = &emac;
 		serial0 = &uart0;
 	};
 
@@ -144,28 +143,12 @@
 	status = "okay";
 };
 
-&emac {
-	pinctrl-names = "default";
-	pinctrl-0 = <&emac_rgmii_pins>;
-	phy-supply = <&reg_gmac_3v3>;
-	phy-handle = <&ext_rgmii_phy>;
-	phy-mode = "rgmii";
-	status = "okay";
-};
-
 &ir {
 	pinctrl-names = "default";
 	pinctrl-0 = <&ir_pins_a>;
 	status = "okay";
 };
 
-&mdio {
-	ext_rgmii_phy: ethernet-phy@1 {
-		compatible = "ethernet-phy-ieee802.3-c22";
-		reg = <1>;
-	};
-};
-
 &mmc0 {
 	pinctrl-names = "default";
 	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
index 732e2e06f503..d9a720bff05d 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
@@ -120,5 +120,8 @@
 };
 
 &pio {
+	interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>,
+		     <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>;
 	compatible = "allwinner,sun50i-h5-pinctrl";
 };
diff --git a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
index 35b8c88c3220..738ed689ff69 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx.dtsi
@@ -400,7 +400,7 @@
 			};
 
 			pwm_AO_ab: pwm@550 {
-				compatible = "amlogic,meson-gx-pwm", "amlogic,meson-gxbb-pwm";
+				compatible = "amlogic,meson-gx-ao-pwm", "amlogic,meson-gxbb-ao-pwm";
 				reg = <0x0 0x00550 0x0 0x10>;
 				#pwm-cells = <3>;
 				status = "disabled";
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
index 72c5a9f64ca8..94567eb17875 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-khadas-vim.dts
@@ -109,8 +109,8 @@
 	status = "okay";
 	pinctrl-0 = <&pwm_ao_a_3_pins>, <&pwm_ao_b_pins>;
 	pinctrl-names = "default";
-	clocks = <&clkc CLKID_FCLK_DIV4>;
-	clock-names = "clkin0";
+	clocks = <&xtal> , <&xtal>;
+	clock-names = "clkin0", "clkin1" ;
 };
 
 &pwm_ef {
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
index 890821d6e52b..266fbcf3e47f 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
@@ -10,12 +10,20 @@
 
 #include <dt-bindings/input/input.h>
 
-#include "meson-gxl-s905x-p212.dtsi"
+#include "meson-gxl-s905x.dtsi"
 
 / {
 	compatible = "libretech,cc", "amlogic,s905x", "amlogic,meson-gxl";
 	model = "Libre Technology CC";
 
+	aliases {
+		serial0 = &uart_AO;
+	};
+
+	chosen {
+		stdout-path = "serial0:115200n8";
+	};
+
 	cvbs-connector {
 		compatible = "composite-video-connector";
 
@@ -26,6 +34,11 @@
 		};
 	};
 
+	emmc_pwrseq: emmc-pwrseq {
+		compatible = "mmc-pwrseq-emmc";
+		reset-gpios = <&gpio BOOT_9 GPIO_ACTIVE_LOW>;
+	};
+
 	hdmi-connector {
 		compatible = "hdmi-connector";
 		type = "a";
@@ -53,6 +66,39 @@
 			linux,default-trigger = "heartbeat";
 		};
 	};
+
+	memory@0 {
+		device_type = "memory";
+		reg = <0x0 0x0 0x0 0x80000000>;
+	};
+
+	vcc_3v3: regulator-vcc_3v3 {
+		compatible = "regulator-fixed";
+		regulator-name = "VCC_3V3";
+		regulator-min-microvolt = <3300000>;
+		regulator-max-microvolt = <3300000>;
+	};
+
+	vcc_card: regulator-vcc-card {
+		compatible = "regulator-gpio";
+
+		regulator-name = "VCC_CARD";
+		regulator-min-microvolt = <1800000>;
+		regulator-max-microvolt = <3300000>;
+
+		gpios = <&gpio_ao GPIOAO_3 GPIO_ACTIVE_HIGH>;
+		gpios-states = <0>;
+
+		states = <3300000 0>,
+			 <1800000 1>;
+	};
+
+	vddio_boot: regulator-vddio_boot {
+		compatible = "regulator-fixed";
+		regulator-name = "VDDIO_BOOT";
+		regulator-min-microvolt = <3300000>;
+		regulator-max-microvolt = <3300000>;
+	};
 };
 
 &cvbs_vdac_port {
@@ -61,6 +107,16 @@
 	};
 };
 
+&ethmac {
+	status = "okay";
+};
+
+&ir {
+	status = "okay";
+	pinctrl-0 = <&remote_input_ao_pins>;
+	pinctrl-names = "default";
+};
+
 &hdmi_tx {
 	status = "okay";
 	pinctrl-0 = <&hdmi_hpd_pins>, <&hdmi_i2c_pins>;
@@ -73,20 +129,43 @@
 	};
 };
 
-/*
- * The following devices exists but are exposed on the general
- * purpose GPIO header. End user may well decide to use those pins
- * for another purpose
- */
+/* SD card */
+&sd_emmc_b {
+	status = "okay";
+	pinctrl-0 = <&sdcard_pins>;
+	pinctrl-names = "default";
+
+	bus-width = <4>;
+	cap-sd-highspeed;
+	max-frequency = <100000000>;
+	disable-wp;
+
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
+	cd-inverted;
 
-&sd_emmc_a {
-	status = "disabled";
+	vmmc-supply = <&vcc_3v3>;
+	vqmmc-supply = <&vcc_card>;
 };
 
-&uart_A {
-	status = "disabled";
+/* eMMC */
+&sd_emmc_c {
+	status = "okay";
+	pinctrl-0 = <&emmc_pins>;
+	pinctrl-names = "default";
+
+	bus-width = <8>;
+	cap-mmc-highspeed;
+	max-frequency = <50000000>;
+	non-removable;
+	disable-wp;
+
+	mmc-pwrseq = <&emmc_pwrseq>;
+	vmmc-supply = <&vcc_3v3>;
+	vqmmc-supply = <&vddio_boot>;
 };
 
-&wifi32k {
-	status = "disabled";
+&uart_AO {
+	status = "okay";
+	pinctrl-0 = <&uart_ao_a_pins>;
+	pinctrl-names = "default";
 };
diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
index e2b0da2c0bc7..105b2938082f 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
@@ -280,9 +280,6 @@
 
 &decon {
 	status = "okay";
-
-	i80-if-timings {
-	};
 };
 
 &decon_tv {
@@ -1116,9 +1113,6 @@
 
 &mic {
 	status = "okay";
-
-	i80-if-timings {
-	};
 };
 
 &pmu_system_controller {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index 31fd77f82ced..d16b9cc1e825 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -653,21 +653,21 @@
 		};
 
 		msi1: msi-controller1@1571000 {
-			compatible = "fsl,1s1043a-msi";
+			compatible = "fsl,ls1043a-msi";
 			reg = <0x0 0x1571000 0x0 0x8>;
 			msi-controller;
 			interrupts = <0 116 0x4>;
 		};
 
 		msi2: msi-controller2@1572000 {
-			compatible = "fsl,1s1043a-msi";
+			compatible = "fsl,ls1043a-msi";
 			reg = <0x0 0x1572000 0x0 0x8>;
 			msi-controller;
 			interrupts = <0 126 0x4>;
 		};
 
 		msi3: msi-controller3@1573000 {
-			compatible = "fsl,1s1043a-msi";
+			compatible = "fsl,ls1043a-msi";
 			reg = <0x0 0x1573000 0x0 0x8>;
 			msi-controller;
 			interrupts = <0 160 0x4>;
@@ -689,7 +689,7 @@
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
 				  0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
-			msi-parent = <&msi1>;
+			msi-parent = <&msi1>, <&msi2>, <&msi3>;
 			#interrupt-cells = <1>;
 			interrupt-map-mask = <0 0 0 7>;
 			interrupt-map = <0000 0 0 1 &gic 0 110 0x4>,
@@ -714,7 +714,7 @@
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000   /* downstream I/O */
 				  0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
-			msi-parent = <&msi2>;
+			msi-parent = <&msi1>, <&msi2>, <&msi3>;
 			#interrupt-cells = <1>;
 			interrupt-map-mask = <0 0 0 7>;
 			interrupt-map = <0000 0 0 1 &gic 0 120  0x4>,
@@ -739,7 +739,7 @@
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000   /* downstream I/O */
 				  0x82000000 0x0 0x40000000 0x50 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */
-			msi-parent = <&msi3>;
+			msi-parent = <&msi1>, <&msi2>, <&msi3>;
 			#interrupt-cells = <1>;
 			interrupt-map-mask = <0 0 0 7>;
 			interrupt-map = <0000 0 0 1 &gic 0 154 0x4>,
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index dc1640be0345..c8ff0baddf1d 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -630,6 +630,37 @@
 			interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&clockgen 4 1>;
 		};
+
+		msi1: msi-controller@1580000 {
+			compatible = "fsl,ls1046a-msi";
+			msi-controller;
+			reg = <0x0 0x1580000 0x0 0x10000>;
+			interrupts = <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		msi2: msi-controller@1590000 {
+			compatible = "fsl,ls1046a-msi";
+			msi-controller;
+			reg = <0x0 0x1590000 0x0 0x10000>;
+			interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
+		msi3: msi-controller@15a0000 {
+			compatible = "fsl,ls1046a-msi";
+			msi-controller;
+			reg = <0x0 0x15a0000 0x0 0x10000>;
+			interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>,
+				     <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>;
+		};
+
 	};
 
 	reserved-memory {
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index dbcc3d4e2ed5..51763d674050 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -219,7 +219,7 @@
 				reg = <0x18800 0x100>, <0x18C00 0x20>;
 				gpiosb: gpio {
 					#gpio-cells = <2>;
-					gpio-ranges = <&pinctrl_sb 0 0 29>;
+					gpio-ranges = <&pinctrl_sb 0 0 30>;
 					gpio-controller;
 					interrupts =
 					<GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
index 1eb1f1e9aac4..4d360713ed12 100644
--- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi
@@ -268,10 +268,10 @@
 				ap_gpio: gpio {
 					compatible = "marvell,armada-8k-gpio";
 					offset = <0x1040>;
-					ngpios = <19>;
+					ngpios = <20>;
 					gpio-controller;
 					#gpio-cells = <2>;
-					gpio-ranges = <&ap_pinctrl 0 0 19>;
+					gpio-ranges = <&ap_pinctrl 0 0 20>;
 				};
 			};
 		};
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
index 726528ce54e9..4c68605675a8 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
@@ -270,6 +270,7 @@
 				interrupt-names = "mem", "ring0", "ring1",
 				"ring2", "ring3", "eip";
 				clocks = <&cpm_clk 1 26>;
+				dma-coherent;
 			};
 		};
 
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
index 95f8e5f607f6..923f354b02f0 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
@@ -64,7 +64,7 @@
 				compatible = "marvell,armada-8k-rtc";
 				reg = <0x284000 0x20>, <0x284080 0x24>;
 				reg-names = "rtc", "rtc-soc";
-				interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>;
+				interrupts = <ICU_GRP_NSR 77 IRQ_TYPE_LEVEL_HIGH>;
 			};
 
 			cps_ethernet: ethernet@0 {
@@ -261,6 +261,7 @@
 				interrupt-names = "mem", "ring0", "ring1",
 						  "ring2", "ring3", "eip";
 				clocks = <&cps_clk 1 26>;
+				dma-coherent;
 				/*
 				 * The cryptographic engine found on the cp110
 				 * master is enabled by default at the SoC
diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
index aef35e0b685a..f903957da504 100644
--- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi
+++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi
@@ -45,7 +45,7 @@
 		stdout-path = "serial0:115200n8";
 	};
 
-	audio_clkout: audio_clkout {
+	audio_clkout: audio-clkout {
 		/*
 		 * This is same as <&rcar_sound 0>
 		 * but needed to avoid cs2000/rcar_sound probe dead-lock
@@ -508,7 +508,7 @@
 
 	/* audio_clkout0/1/2/3 */
 	#clock-cells = <1>;
-	clock-frequency = <11289600 12288000>;
+	clock-frequency = <12288000 11289600>;
 
 	status = "okay";
 
diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi
index b5c6ee07d7f9..d1a3f3b7a0ab 100644
--- a/arch/arm64/boot/dts/renesas/ulcb.dtsi
+++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi
@@ -281,7 +281,7 @@
 
 	/* audio_clkout0/1/2/3 */
 	#clock-cells = <1>;
-	clock-frequency = <11289600 12288000>;
+	clock-frequency = <12288000 11289600>;
 
 	status = "okay";
 
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 6c7d147eed54..b4ca115b3be1 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -476,6 +476,7 @@ CONFIG_QCOM_CLK_SMD_RPM=y
 CONFIG_MSM_GCC_8916=y
 CONFIG_MSM_GCC_8994=y
 CONFIG_MSM_MMCC_8996=y
+CONFIG_HWSPINLOCK=y
 CONFIG_HWSPINLOCK_QCOM=y
 CONFIG_ARM_MHU=y
 CONFIG_PLATFORM_MHU=y
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 8cef47fa2218..b7e3f74822da 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -116,6 +116,8 @@ static inline void gic_write_bpr1(u32 val)
 
 #define gic_read_typer(c)		readq_relaxed(c)
 #define gic_write_irouter(v, c)		writeq_relaxed(v, c)
+#define gic_read_lpir(c)		readq_relaxed(c)
+#define gic_write_lpir(v, c)		writeq_relaxed(v, c)
 
 #define gic_flush_dcache_to_poc(a,l)	__flush_dcache_area((a), (l))
 
@@ -133,5 +135,10 @@ static inline void gic_write_bpr1(u32 val)
 #define gicr_write_pendbaser(v, c)	writeq_relaxed(v, c)
 #define gicr_read_pendbaser(c)		readq_relaxed(c)
 
+#define gits_write_vpropbaser(v, c)	writeq_relaxed(v, c)
+
+#define gits_write_vpendbaser(v, c)	writeq_relaxed(v, c)
+#define gits_read_vpendbaser(c)		readq_relaxed(c)
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_ARCH_GICV3_H */
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index 74d08e44a651..a652ce0a5cb2 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -65,13 +65,13 @@ DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *,
 	u64 _val;							\
 	if (needs_unstable_timer_counter_workaround()) {		\
 		const struct arch_timer_erratum_workaround *wa;		\
-		preempt_disable();					\
+		preempt_disable_notrace();				\
 		wa = __this_cpu_read(timer_unstable_counter_workaround); \
 		if (wa && wa->read_##reg)				\
 			_val = wa->read_##reg();			\
 		else							\
 			_val = read_sysreg(reg);			\
-		preempt_enable();					\
+		preempt_enable_notrace();				\
 	} else {							\
 		_val = read_sysreg(reg);				\
 	}								\
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 99fa69c9c3cf..9ef0797380cb 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -435,7 +435,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	"	sub	x30, x30, %[ret]\n"
 	"	cbnz	x30, 1b\n"
 	"2:")
-	: [ret] "+&r" (x0), [v] "+Q" (v->counter)
+	: [ret] "+r" (x0), [v] "+Q" (v->counter)
 	:
 	: __LL_SC_CLOBBERS, "cc", "memory");
 
diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h
index 366448eb0fb7..a02a57186f56 100644
--- a/arch/arm64/include/asm/bug.h
+++ b/arch/arm64/include/asm/bug.h
@@ -36,7 +36,7 @@
 #ifdef CONFIG_GENERIC_BUG
 
 #define __BUG_ENTRY(flags) 				\
-		".pushsection __bug_table,\"a\"\n\t"	\
+		".pushsection __bug_table,\"aw\"\n\t"	\
 		".align 2\n\t"				\
 	"0:	.long 1f - 0b\n\t"			\
 _BUGVERBOSE_LOCATION(__FILE__, __LINE__)		\
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index acae781f7359..3288c2b36731 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -114,10 +114,10 @@
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
-#define ELF_ET_DYN_BASE		0x100000000UL
+#define ELF_ET_DYN_BASE		(2 * TASK_SIZE_64 / 3)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index f32b42e8725d..5bb2fd4674e7 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -48,20 +48,10 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (int)(encoded_op << 8) >> 20;
-	int cmparg = (int)(encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1U << (oparg & 0x1f);
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d68630007b14..e923b58606e2 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -326,12 +326,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
-/* We do not have shadow page tables, hence the empty hooks */
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-							 unsigned long address)
-{
-}
-
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 32f82723338a..ef39dcb9ca6a 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -64,8 +64,10 @@
  * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
  */
 #define VA_BITS			(CONFIG_ARM64_VA_BITS)
-#define VA_START		(UL(0xffffffffffffffff) << VA_BITS)
-#define PAGE_OFFSET		(UL(0xffffffffffffffff) << (VA_BITS - 1))
+#define VA_START		(UL(0xffffffffffffffff) - \
+	(UL(1) << VA_BITS) + 1)
+#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \
+	(UL(1) << (VA_BITS - 1)) + 1)
 #define KIMAGE_VADDR		(MODULES_END)
 #define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
 #define MODULES_VADDR		(VA_START + KASAN_SHADOW_SIZE)
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index cae331d553f8..95ad7102b63c 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -26,58 +26,6 @@
  * The memory barriers are implicit with the load-acquire and store-release
  * instructions.
  */
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	unsigned int tmp;
-	arch_spinlock_t lockval;
-	u32 owner;
-
-	/*
-	 * Ensure prior spin_lock operations to other locks have completed
-	 * on this CPU before we test whether "lock" is locked.
-	 */
-	smp_mb();
-	owner = READ_ONCE(lock->owner) << 16;
-
-	asm volatile(
-"	sevl\n"
-"1:	wfe\n"
-"2:	ldaxr	%w0, %2\n"
-	/* Is the lock free? */
-"	eor	%w1, %w0, %w0, ror #16\n"
-"	cbz	%w1, 3f\n"
-	/* Lock taken -- has there been a subsequent unlock->lock transition? */
-"	eor	%w1, %w3, %w0, lsl #16\n"
-"	cbz	%w1, 1b\n"
-	/*
-	 * The owner has been updated, so there was an unlock->lock
-	 * transition that we missed. That means we can rely on the
-	 * store-release of the unlock operation paired with the
-	 * load-acquire of the lock operation to publish any of our
-	 * previous stores to the new lock owner and therefore don't
-	 * need to bother with the writeback below.
-	 */
-"	b	4f\n"
-"3:\n"
-	/*
-	 * Serialise against any concurrent lockers by writing back the
-	 * unlocked lock value
-	 */
-	ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-"	stxr	%w1, %w0, %2\n"
-	__nops(2),
-	/* LSE atomics */
-"	mov	%w1, %w0\n"
-"	cas	%w0, %w0, %2\n"
-"	eor	%w1, %w1, %w0\n")
-	/* Somebody else wrote to the lock, GOTO 10 and reload the value */
-"	cbnz	%w1, 2b\n"
-"4:"
-	: "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
-	: "r" (owner)
-	: "memory");
-}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
@@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	smp_mb(); /* See arch_spin_unlock_wait */
+	/*
+	 * Ensure prior spin_lock operations to other locks have completed
+	 * on this CPU before we test whether "lock" is locked.
+	 */
+	smp_mb(); /* ^^^ */
 	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
@@ -358,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
 
-/*
- * Accesses appearing in program order before a spin_lock() operation
- * can be reordered with accesses inside the critical section, by virtue
- * of arch_spin_lock being constructed using acquire semantics.
- *
- * In cases where this is problematic (e.g. try_to_wake_up), an
- * smp_mb__before_spinlock() can restore the required ordering.
- */
-#define smp_mb__before_spinlock()	smp_mb()
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()	smp_mb()
 
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 16e44fa9b3b6..248339e4aaf5 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -492,7 +492,7 @@ asm(
  * the "%x0" template means XZR.
  */
 #define write_sysreg(v, r) do {					\
-	u64 __val = (u64)v;					\
+	u64 __val = (u64)(v);					\
 	asm volatile("msr " __stringify(r) ", %x0"		\
 		     : : "rZ" (__val));				\
 } while (0)
@@ -508,7 +508,7 @@ asm(
 })
 
 #define write_sysreg_s(v, r) do {					\
-	u64 __val = (u64)v;						\
+	u64 __val = (u64)(v);						\
 	asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val));	\
 } while (0)
 
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..c5ba565544ee 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
 #define TIF_UPROBE		4	/* uprobe breakpoint or singlestep */
+#define TIF_FSCHECK		5	/* Check FS is USER_DS on return */
 #define TIF_NOHZ		7
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
@@ -107,11 +108,12 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-				 _TIF_UPROBE)
+				 _TIF_UPROBE | _TIF_FSCHECK)
 
 #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 02e9035b0685..47a9066f7c86 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook);
 
 void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr);
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static inline int __in_irqentry_text(unsigned long ptr)
 {
 	return ptr >= (unsigned long)&__irqentry_text_start &&
 	       ptr < (unsigned long)&__irqentry_text_end;
 }
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
-	return 0;
-}
-#endif
 
 static inline int in_exception_text(unsigned long ptr)
 {
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 8f0a1de11e4a..a801a48a7972 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+
 	/*
 	 * Enable/disable UAO so that copy_to_user() etc can access
 	 * kernel memory with the unprivileged instructions.
@@ -69,7 +72,7 @@ static inline void set_fs(mm_segment_t fs)
  */
 #define __range_ok(addr, size)						\
 ({									\
-	unsigned long __addr = (unsigned long __force)(addr);		\
+	unsigned long __addr = (unsigned long)(addr);			\
 	unsigned long flag, roksum;					\
 	__chk_user_ptr(addr);						\
 	asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls"		\
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index e137ceaf5016..d16978213c5b 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -82,8 +82,8 @@ static const char *__init cpu_read_enable_method(int cpu)
 			 * Don't warn spuriously.
 			 */
 			if (cpu != 0)
-				pr_err("%s: missing enable-method property\n",
-					dn->full_name);
+				pr_err("%pOF: missing enable-method property\n",
+					dn);
 		}
 	} else {
 		enable_method = acpi_get_enable_method(cpu);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 06da8ea16bbe..c7b4995868e1 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -161,9 +161,11 @@ void fpsimd_flush_thread(void)
 {
 	if (!system_supports_fpsimd())
 		return;
+	preempt_disable();
 	memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
 	fpsimd_flush_task_state(current);
 	set_thread_flag(TIF_FOREIGN_FPSTATE);
+	preempt_enable();
 }
 
 /*
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 973df7de7bf8..adb0910b88f5 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -354,7 +354,6 @@ __primary_switched:
 	tst	x23, ~(MIN_KIMG_ALIGN - 1)	// already running randomized?
 	b.ne	0f
 	mov	x0, x21				// pass FDT address in x0
-	mov	x1, x23				// pass modulo offset in x1
 	bl	kaslr_early_init		// parse FDT for KASLR options
 	cbz	x0, 0f				// KASLR disabled? just proceed
 	orr	x23, x23, x0			// record KASLR offset
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index a9710efb8c01..47080c49cc7e 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -75,7 +75,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size,
  * containing function pointers) to be reinitialized, and zero-initialized
  * .bss variables will be reset to 0.
  */
-u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
+u64 __init kaslr_early_init(u64 dt_phys)
 {
 	void *fdt;
 	u64 seed, offset, mask, module_range;
@@ -131,15 +131,17 @@ u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
 	/*
 	 * The kernel Image should not extend across a 1GB/32MB/512MB alignment
 	 * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
-	 * happens, increase the KASLR offset by the size of the kernel image
-	 * rounded up by SWAPPER_BLOCK_SIZE.
+	 * happens, round down the KASLR offset by (1 << SWAPPER_TABLE_SHIFT).
+	 *
+	 * NOTE: The references to _text and _end below will already take the
+	 *       modulo offset (the physical displacement modulo 2 MB) into
+	 *       account, given that the physical placement is controlled by
+	 *       the loader, and will not change as a result of the virtual
+	 *       mapping we choose.
 	 */
-	if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) !=
-	    (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) {
-		u64 kimg_sz = _end - _text;
-		offset = (offset + round_up(kimg_sz, SWAPPER_BLOCK_SIZE))
-				& mask;
-	}
+	if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) !=
+	    (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT))
+		offset = round_down(offset, 1 << SWAPPER_TABLE_SHIFT);
 
 	if (IS_ENABLED(CONFIG_KASAN))
 		/*
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 659ae8094ed5..c8f7d98d8cb9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
 	 * the thread migrates to a different CPU.
+	 * This full barrier is also required by the membarrier system
+	 * call.
 	 */
 	dsb(ish);
 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 089c3747995d..e3e3293d1123 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -29,6 +29,7 @@
 #include <linux/string.h>
 #include <linux/tracehook.h>
 #include <linux/ratelimit.h>
+#include <linux/syscalls.h>
 
 #include <asm/debug-monitors.h>
 #include <asm/elf.h>
@@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (thread_flags & _TIF_NEED_RESCHED) {
 			schedule();
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 321119881abf..dc66e6ec3a99 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -469,7 +469,7 @@ static u64 __init of_get_cpu_mpidr(struct device_node *dn)
 	 */
 	cell = of_get_property(dn, "reg", NULL);
 	if (!cell) {
-		pr_err("%s: missing reg property\n", dn->full_name);
+		pr_err("%pOF: missing reg property\n", dn);
 		return INVALID_HWID;
 	}
 
@@ -478,7 +478,7 @@ static u64 __init of_get_cpu_mpidr(struct device_node *dn)
 	 * Non affinity bits must be set to 0 in the DT
 	 */
 	if (hwid & ~MPIDR_HWID_BITMASK) {
-		pr_err("%s: invalid reg property\n", dn->full_name);
+		pr_err("%pOF: invalid reg property\n", dn);
 		return INVALID_HWID;
 	}
 	return hwid;
@@ -627,8 +627,8 @@ static void __init of_parse_and_init_cpus(void)
 			goto next;
 
 		if (is_mpidr_duplicate(cpu_count, hwid)) {
-			pr_err("%s: duplicate cpu reg properties in the DT\n",
-				dn->full_name);
+			pr_err("%pOF: duplicate cpu reg properties in the DT\n",
+				dn);
 			goto next;
 		}
 
@@ -640,8 +640,8 @@ static void __init of_parse_and_init_cpus(void)
 		 */
 		if (hwid == cpu_logical_map(0)) {
 			if (bootcpu_valid) {
-				pr_err("%s: duplicate boot cpu reg property in DT\n",
-					dn->full_name);
+				pr_err("%pOF: duplicate boot cpu reg property in DT\n",
+					dn);
 				goto next;
 			}
 
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 79244c75eaec..8d48b233e6ce 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -45,7 +45,7 @@ static int __init get_cpu_for_node(struct device_node *node)
 		}
 	}
 
-	pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);
+	pr_crit("Unable to find CPU node for %pOF\n", cpu_node);
 
 	of_node_put(cpu_node);
 	return -1;
@@ -71,8 +71,8 @@ static int __init parse_core(struct device_node *core, int cluster_id,
 				cpu_topology[cpu].core_id = core_id;
 				cpu_topology[cpu].thread_id = i;
 			} else {
-				pr_err("%s: Can't get CPU for thread\n",
-				       t->full_name);
+				pr_err("%pOF: Can't get CPU for thread\n",
+				       t);
 				of_node_put(t);
 				return -EINVAL;
 			}
@@ -84,15 +84,15 @@ static int __init parse_core(struct device_node *core, int cluster_id,
 	cpu = get_cpu_for_node(core);
 	if (cpu >= 0) {
 		if (!leaf) {
-			pr_err("%s: Core has both threads and CPU\n",
-			       core->full_name);
+			pr_err("%pOF: Core has both threads and CPU\n",
+			       core);
 			return -EINVAL;
 		}
 
 		cpu_topology[cpu].cluster_id = cluster_id;
 		cpu_topology[cpu].core_id = core_id;
 	} else if (leaf) {
-		pr_err("%s: Can't get CPU for leaf core\n", core->full_name);
+		pr_err("%pOF: Can't get CPU for leaf core\n", core);
 		return -EINVAL;
 	}
 
@@ -137,8 +137,8 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 			has_cores = true;
 
 			if (depth == 0) {
-				pr_err("%s: cpu-map children should be clusters\n",
-				       c->full_name);
+				pr_err("%pOF: cpu-map children should be clusters\n",
+				       c);
 				of_node_put(c);
 				return -EINVAL;
 			}
@@ -146,8 +146,8 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 			if (leaf) {
 				ret = parse_core(c, cluster_id, core_id++);
 			} else {
-				pr_err("%s: Non-leaf cluster with core %s\n",
-				       cluster->full_name, name);
+				pr_err("%pOF: Non-leaf cluster with core %s\n",
+				       cluster, name);
 				ret = -EINVAL;
 			}
 
@@ -159,7 +159,7 @@ static int __init parse_cluster(struct device_node *cluster, int depth)
 	} while (c);
 
 	if (leaf && !has_cores)
-		pr_warn("%s: empty cluster\n", cluster->full_name);
+		pr_warn("%pOF: empty cluster\n", cluster);
 
 	if (leaf)
 		cluster_id++;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index c7c7088097be..8a62648848e5 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -274,10 +274,12 @@ static DEFINE_RAW_SPINLOCK(die_lock);
 void die(const char *str, struct pt_regs *regs, int err)
 {
 	int ret;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&die_lock, flags);
 
 	oops_enter();
 
-	raw_spin_lock_irq(&die_lock);
 	console_verbose();
 	bust_spinlocks(1);
 	ret = __die(str, err, regs);
@@ -287,13 +289,15 @@ void die(const char *str, struct pt_regs *regs, int err)
 
 	bust_spinlocks(0);
 	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	raw_spin_unlock_irq(&die_lock);
 	oops_exit();
 
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
 	if (panic_on_oops)
 		panic("Fatal exception");
+
+	raw_spin_unlock_irqrestore(&die_lock, flags);
+
 	if (ret != NOTIFY_STOP)
 		do_exit(SIGSEGV);
 }
@@ -519,7 +523,7 @@ static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
 {
 	int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
 
-	pt_regs_write_reg(regs, rt, read_sysreg(cntfrq_el0));
+	pt_regs_write_reg(regs, rt, arch_timer_get_rate());
 	regs->pc += 4;
 }
 
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 77862881ae86..2e070d3baf9f 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -764,7 +764,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	if (p->is_write) {
 		if (r->CRm & 0x2)
 			/* accessing PMOVSSET_EL0 */
-			kvm_pmu_overflow_set(vcpu, p->regval & mask);
+			vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
 		else
 			/* accessing PMOVSCLR_EL0 */
 			vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index c3cd65e31814..076c43715e64 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -30,9 +30,10 @@
  */
 ENTRY(copy_page)
 alternative_if ARM64_HAS_NO_HW_PREFETCH
-	# Prefetch two cache lines ahead.
-	prfm    pldl1strm, [x1, #128]
-	prfm    pldl1strm, [x1, #256]
+	// Prefetch three cache lines ahead.
+	prfm	pldl1strm, [x1, #128]
+	prfm	pldl1strm, [x1, #256]
+	prfm	pldl1strm, [x1, #384]
 alternative_else_nop_endif
 
 	ldp	x2, x3, [x1]
@@ -50,7 +51,7 @@ alternative_else_nop_endif
 	subs	x18, x18, #128
 
 alternative_if ARM64_HAS_NO_HW_PREFETCH
-	prfm    pldl1strm, [x1, #384]
+	prfm	pldl1strm, [x1, #384]
 alternative_else_nop_endif
 
 	stnp	x2, x3, [x0]
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index e90cd1db42a8..f27d4dd04384 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -329,7 +329,7 @@ static int __swiotlb_mmap(struct device *dev,
 	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
 					     is_device_dma_coherent(dev));
 
-	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
 	return __swiotlb_mmap_pfn(vma, pfn, size);
@@ -706,7 +706,7 @@ static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
 					     is_device_dma_coherent(dev));
 
-	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
 	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c7861c9864e6..1f22a41565a3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -163,26 +163,27 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	/* only preserve the access flags and write permission */
 	pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY;
 
-	/*
-	 * PTE_RDONLY is cleared by default in the asm below, so set it in
-	 * back if necessary (read-only or clean PTE).
-	 */
+	/* set PTE_RDONLY if actual read-only or clean PTE */
 	if (!pte_write(entry) || !pte_sw_dirty(entry))
 		pte_val(entry) |= PTE_RDONLY;
 
 	/*
 	 * Setting the flags must be done atomically to avoid racing with the
-	 * hardware update of the access/dirty state.
+	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
+	 * be set to the most permissive (lowest value) of *ptep and entry
+	 * (calculated as: a & b == ~(~a | ~b)).
 	 */
+	pte_val(entry) ^= PTE_RDONLY;
 	asm volatile("//	ptep_set_access_flags\n"
 	"	prfm	pstl1strm, %2\n"
 	"1:	ldxr	%0, %2\n"
-	"	and	%0, %0, %3		// clear PTE_RDONLY\n"
+	"	eor	%0, %0, %3		// negate PTE_RDONLY in *ptep\n"
 	"	orr	%0, %0, %4		// set flags\n"
+	"	eor	%0, %0, %3		// negate final PTE_RDONLY\n"
 	"	stxr	%w1, %0, %2\n"
 	"	cbnz	%w1, 1b\n"
 	: "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))
-	: "L" (~PTE_RDONLY), "r" (pte_val(entry)));
+	: "L" (PTE_RDONLY), "r" (pte_val(entry)));
 
 	flush_tlb_fix_spurious_fault(vma, address);
 	return 1;
@@ -434,8 +435,11 @@ retry:
 		 * the mmap_sem because it would already be released
 		 * in __lock_page_or_retry in mm/filemap.c.
 		 */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			if (!user_mode(regs))
+				goto no_context;
 			return 0;
+		}
 
 		/*
 		 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 23c2d89a362e..f1eb15e0e864 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -496,7 +496,7 @@ void mark_rodata_ro(void)
 
 static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
 				      pgprot_t prot, struct vm_struct *vma,
-				      int flags)
+				      int flags, unsigned long vm_flags)
 {
 	phys_addr_t pa_start = __pa_symbol(va_start);
 	unsigned long size = va_end - va_start;
@@ -507,10 +507,13 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
 	__create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
 			     early_pgtable_alloc, flags);
 
+	if (!(vm_flags & VM_NO_GUARD))
+		size += PAGE_SIZE;
+
 	vma->addr	= va_start;
 	vma->phys_addr	= pa_start;
 	vma->size	= size;
-	vma->flags	= VM_MAP;
+	vma->flags	= VM_MAP | vm_flags;
 	vma->caller	= __builtin_return_address(0);
 
 	vm_area_add_early(vma);
@@ -541,14 +544,15 @@ static void __init map_kernel(pgd_t *pgd)
 	 * Only rodata will be remapped with different permissions later on,
 	 * all other segments are allowed to use contiguous mappings.
 	 */
-	map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0);
+	map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0,
+			   VM_NO_GUARD);
 	map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL,
-			   &vmlinux_rodata, NO_CONT_MAPPINGS);
+			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
 	map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot,
-			   &vmlinux_inittext, 0);
+			   &vmlinux_inittext, 0, VM_NO_GUARD);
 	map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL,
-			   &vmlinux_initdata, 0);
-	map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0);
+			   &vmlinux_initdata, 0, VM_NO_GUARD);
+	map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 
 	if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
 		/*
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index b388a99fea7b..dad128ba98bf 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -208,8 +208,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 	}
 
 	node_set(nid, numa_nodes_parsed);
-	pr_info("Adding memblock [0x%llx - 0x%llx] on node %d\n",
-			start, (end - 1), nid);
 	return ret;
 }
 
@@ -223,10 +221,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 	void *nd;
 	int tnid;
 
-	if (start_pfn < end_pfn)
-		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
-			start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
-	else
+	if (start_pfn >= end_pfn)
 		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
 
 	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
diff --git a/arch/blackfin/include/asm/bug.h b/arch/blackfin/include/asm/bug.h
index 8d9b1eba89c4..76b2e82ee730 100644
--- a/arch/blackfin/include/asm/bug.h
+++ b/arch/blackfin/include/asm/bug.h
@@ -21,7 +21,7 @@
 #define _BUG_OR_WARN(flags)						\
 	asm volatile(							\
 		"1:	.hword	%0\n"					\
-		"	.section __bug_table,\"a\",@progbits\n"		\
+		"	.section __bug_table,\"aw\",@progbits\n"	\
 		"2:	.long	1b\n"					\
 		"	.long	%1\n"					\
 		"	.short	%2\n"					\
@@ -38,7 +38,7 @@
 #define _BUG_OR_WARN(flags)						\
 	asm volatile(							\
 		"1:	.hword	%0\n"					\
-		"	.section __bug_table,\"a\",@progbits\n"		\
+		"	.section __bug_table,\"aw\",@progbits\n"	\
 		"2:	.long	1b\n"					\
 		"	.short	%1\n"					\
 		"	.org	2b + %2\n"				\
diff --git a/arch/blackfin/include/asm/flat.h b/arch/blackfin/include/asm/flat.h
index 296d7f56fbfd..f1d6ba7afbf2 100644
--- a/arch/blackfin/include/asm/flat.h
+++ b/arch/blackfin/include/asm/flat.h
@@ -44,8 +44,7 @@ flat_get_relocate_addr (unsigned long relval)
 	return relval & 0x03ffffff; /* Mask out top 6 bits */
 }
 
-static inline int flat_set_persistent(unsigned long relval,
-				      unsigned long *persistent)
+static inline int flat_set_persistent(u32 relval, u32 *persistent)
 {
 	int type = (relval >> 26) & 7;
 	if (type == 3) {
diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h
index c58f4a83ed6f..f6431439d15d 100644
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__raw_spin_unlock_asm(&lock->lock);
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline int arch_read_can_lock(arch_rwlock_t *rw)
 {
 	return __raw_uncached_fetch_asm(&rw->lock) > 0;
diff --git a/arch/blackfin/kernel/flat.c b/arch/blackfin/kernel/flat.c
index d29ab6a2e909..8ebc54daaa8e 100644
--- a/arch/blackfin/kernel/flat.c
+++ b/arch/blackfin/kernel/flat.c
@@ -32,7 +32,7 @@ unsigned long bfin_get_addr_from_rp(u32 *ptr,
 		break;
 
 	case FLAT_BFIN_RELOC_TYPE_32_BIT:
-		pr_debug("*ptr = %lx", get_unaligned(ptr));
+		pr_debug("*ptr = %x", get_unaligned(ptr));
 		val = get_unaligned(ptr);
 		break;
 
@@ -77,7 +77,7 @@ void bfin_put_addr_at_rp(u32 *ptr, u32 addr, u32 relval)
 
 	case FLAT_BFIN_RELOC_TYPE_32_BIT:
 		put_unaligned(addr, ptr);
-		pr_debug("new ptr =%lx", get_unaligned(ptr));
+		pr_debug("new ptr =%x", get_unaligned(ptr));
 		break;
 	}
 }
diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 0188c933b155..15af5768c403 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -4,8 +4,6 @@
  * Licensed under the GPL-2 or later
  */
 
-#define pr_fmt(fmt) "module %s: " fmt, mod->name
-
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
@@ -16,6 +14,11 @@
 #include <asm/cacheflush.h>
 #include <linux/uaccess.h>
 
+#define mod_err(mod, fmt, ...)						\
+	pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+#define mod_debug(mod, fmt, ...)					\
+	pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+
 /* Transfer the section to the L1 memory */
 int
 module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_inst_sram_alloc(s->sh_size);
 			mod->arch.text_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 inst memory allocation failed\n");
+				mod_err(mod, "L1 inst memory allocation failed\n");
 				return -1;
 			}
 			dma_memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.data_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_zalloc(s->sh_size);
 			mod->arch.bss_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 
@@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.data_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.bss_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memset(dest, 0, s->sh_size);
@@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.text_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.data_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_zalloc(s->sh_size);
 			mod->arch.bss_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 
@@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 	Elf32_Sym *sym;
 	unsigned long location, value, size;
 
-	pr_debug("applying relocate section %u to %u\n",
-		relsec, sechdrs[relsec].sh_info);
+	mod_debug(mod, "applying relocate section %u to %u\n",
+		  relsec, sechdrs[relsec].sh_info);
 
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
@@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 #ifdef CONFIG_SMP
 		if (location >= COREB_L1_DATA_A_START) {
-			pr_err("cannot relocate in L1: %u (SMP kernel)\n",
+			mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
 #endif
 
-		pr_debug("location is %lx, value is %lx type is %d\n",
-			location, value, ELF32_R_TYPE(rel[i].r_info));
+		mod_debug(mod, "location is %lx, value is %lx type is %d\n",
+			  location, value, ELF32_R_TYPE(rel[i].r_info));
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 
@@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 		case R_BFIN_PCREL12_JUMP:
 		case R_BFIN_PCREL12_JUMP_S:
 		case R_BFIN_PCREL10:
-			pr_err("unsupported relocation: %u (no -mlong-calls?)\n",
+			mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 
 		default:
-			pr_err("unknown relocation: %u\n",
+			mod_err(mod, "unknown relocation: %u\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 			isram_memcpy((void *)location, &value, size);
 			break;
 		default:
-			pr_err("invalid relocation for %#lx\n", location);
+			mod_err(mod, "invalid relocation for %#lx\n", location);
 			return -ENOEXEC;
 		}
 	}
diff --git a/arch/c6x/configs/dsk6455_defconfig b/arch/c6x/configs/dsk6455_defconfig
index 4663487c67a1..d764ea4cce7f 100644
--- a/arch/c6x/configs/dsk6455_defconfig
+++ b/arch/c6x/configs/dsk6455_defconfig
@@ -1,5 +1,4 @@
 CONFIG_SOC_TMS320C6455=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_SPARSE_IRQ=y
@@ -25,7 +24,6 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=2
 CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6457_defconfig b/arch/c6x/configs/evmc6457_defconfig
index bba40e195ec4..05d0b4a25ab1 100644
--- a/arch/c6x/configs/evmc6457_defconfig
+++ b/arch/c6x/configs/evmc6457_defconfig
@@ -1,5 +1,4 @@
 CONFIG_SOC_TMS320C6457=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_SPARSE_IRQ=y
@@ -26,7 +25,6 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=2
 CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6472_defconfig b/arch/c6x/configs/evmc6472_defconfig
index 8c46155f6d31..8d81fcf86b0e 100644
--- a/arch/c6x/configs/evmc6472_defconfig
+++ b/arch/c6x/configs/evmc6472_defconfig
@@ -1,5 +1,4 @@
 CONFIG_SOC_TMS320C6472=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=2
 CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6474_defconfig b/arch/c6x/configs/evmc6474_defconfig
index 15533f632313..8156a98f3958 100644
--- a/arch/c6x/configs/evmc6474_defconfig
+++ b/arch/c6x/configs/evmc6474_defconfig
@@ -1,5 +1,4 @@
 CONFIG_SOC_TMS320C6474=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=2
 CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
diff --git a/arch/c6x/configs/evmc6678_defconfig b/arch/c6x/configs/evmc6678_defconfig
index 5f126d4905b1..c4f433c25b69 100644
--- a/arch/c6x/configs/evmc6678_defconfig
+++ b/arch/c6x/configs/evmc6678_defconfig
@@ -1,5 +1,4 @@
 CONFIG_SOC_TMS320C6678=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_SPARSE_IRQ=y
@@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=2
 CONFIG_BLK_DEV_RAM_SIZE=17000
-CONFIG_MISC_DEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
diff --git a/arch/c6x/platforms/megamod-pic.c b/arch/c6x/platforms/megamod-pic.c
index 43afc03e4125..9519fa5f97d0 100644
--- a/arch/c6x/platforms/megamod-pic.c
+++ b/arch/c6x/platforms/megamod-pic.c
@@ -208,14 +208,14 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
 
 	pic = kzalloc(sizeof(struct megamod_pic), GFP_KERNEL);
 	if (!pic) {
-		pr_err("%s: Could not alloc PIC structure.\n", np->full_name);
+		pr_err("%pOF: Could not alloc PIC structure.\n", np);
 		return NULL;
 	}
 
 	pic->irqhost = irq_domain_add_linear(np, NR_COMBINERS * 32,
 					     &megamod_domain_ops, pic);
 	if (!pic->irqhost) {
-		pr_err("%s: Could not alloc host.\n", np->full_name);
+		pr_err("%pOF: Could not alloc host.\n", np);
 		goto error_free;
 	}
 
@@ -225,7 +225,7 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
 
 	pic->regs = of_iomap(np, 0);
 	if (!pic->regs) {
-		pr_err("%s: Could not map registers.\n", np->full_name);
+		pr_err("%pOF: Could not map registers.\n", np);
 		goto error_free;
 	}
 
@@ -253,8 +253,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
 
 		irq_data = irq_get_irq_data(irq);
 		if (!irq_data) {
-			pr_err("%s: combiner-%d no irq_data for virq %d!\n",
-			       np->full_name, i, irq);
+			pr_err("%pOF: combiner-%d no irq_data for virq %d!\n",
+			       np, i, irq);
 			continue;
 		}
 
@@ -265,16 +265,16 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
 		 * of the core priority interrupts (4 - 15).
 		 */
 		if (hwirq < 4 || hwirq >= NR_PRIORITY_IRQS) {
-			pr_err("%s: combiner-%d core irq %ld out of range!\n",
-			       np->full_name, i, hwirq);
+			pr_err("%pOF: combiner-%d core irq %ld out of range!\n",
+			       np, i, hwirq);
 			continue;
 		}
 
 		/* record the mapping */
 		mapping[hwirq - 4] = i;
 
-		pr_debug("%s: combiner-%d cascading to hwirq %ld\n",
-			 np->full_name, i, hwirq);
+		pr_debug("%pOF: combiner-%d cascading to hwirq %ld\n",
+			 np, i, hwirq);
 
 		cascade_data[i].pic = pic;
 		cascade_data[i].index = i;
@@ -290,8 +290,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np)
 	/* Finally, set up the MUX registers */
 	for (i = 0; i < NR_MUX_OUTPUTS; i++) {
 		if (mapping[i] != IRQ_UNMAPPED) {
-			pr_debug("%s: setting mux %d to priority %d\n",
-				 np->full_name, mapping[i], i + 4);
+			pr_debug("%pOF: setting mux %d to priority %d\n",
+				 np, mapping[i], i + 4);
 			set_megamod_mux(pic, mapping[i], i);
 		}
 	}
diff --git a/arch/c6x/platforms/plldata.c b/arch/c6x/platforms/plldata.c
index 755359eb6286..e8b6cc6a7b5a 100644
--- a/arch/c6x/platforms/plldata.c
+++ b/arch/c6x/platforms/plldata.c
@@ -436,8 +436,8 @@ void __init c64x_setup_clocks(void)
 
 	err = of_property_read_u32(node, "clock-frequency", &val);
 	if (err || val == 0) {
-		pr_err("%s: no clock-frequency found! Using %dMHz\n",
-		       node->full_name, (int)val / 1000000);
+		pr_err("%pOF: no clock-frequency found! Using %dMHz\n",
+		       node, (int)val / 1000000);
 		val = 25000000;
 	}
 	clkin1.rate = val;
diff --git a/arch/c6x/platforms/timer64.c b/arch/c6x/platforms/timer64.c
index 0bd0452ded80..241a9a607193 100644
--- a/arch/c6x/platforms/timer64.c
+++ b/arch/c6x/platforms/timer64.c
@@ -204,14 +204,14 @@ void __init timer64_init(void)
 
 	timer = of_iomap(np, 0);
 	if (!timer) {
-		pr_debug("%s: Cannot map timer registers.\n", np->full_name);
+		pr_debug("%pOF: Cannot map timer registers.\n", np);
 		goto out;
 	}
-	pr_debug("%s: Timer registers=%p.\n", np->full_name, timer);
+	pr_debug("%pOF: Timer registers=%p.\n", np, timer);
 
 	cd->irq	= irq_of_parse_and_map(np, 0);
 	if (cd->irq == NO_IRQ) {
-		pr_debug("%s: Cannot find interrupt.\n", np->full_name);
+		pr_debug("%pOF: Cannot find interrupt.\n", np);
 		iounmap(timer);
 		goto out;
 	}
@@ -229,7 +229,7 @@ void __init timer64_init(void)
 		dscr_set_devstate(timer64_devstate_id, DSCR_DEVSTATE_ENABLED);
 	}
 
-	pr_debug("%s: Timer irq=%d.\n", np->full_name, cd->irq);
+	pr_debug("%pOF: Timer irq=%d.\n", np, cd->irq);
 
 	clockevents_calc_mult_shift(cd, c6x_core_freq / TIMER_DIVISOR, 5);
 
diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c
index ab5c421a4de8..735a9b0abdb8 100644
--- a/arch/cris/arch-v32/mach-a3/arbiter.c
+++ b/arch/cris/arch-v32/mach-a3/arbiter.c
@@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots)
 	}
 }
 
-extern char _stext, _etext;
+extern char _stext[], _etext[];
 
 static void crisv32_arbiter_init(void)
 {
@@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void)
 
 #ifndef CONFIG_ETRAX_KGDB
 	/* Global watch for writes to kernel text segment. */
-	crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+	crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
 		MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients),
 			      arbiter_all_write, NULL);
 #endif
diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c
index c97f4d8120f9..047c70bdbb23 100644
--- a/arch/cris/arch-v32/mach-fs/arbiter.c
+++ b/arch/cris/arch-v32/mach-fs/arbiter.c
@@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots)
 	}
 }
 
-extern char _stext, _etext;
+extern char _stext[], _etext[];
 
 static void crisv32_arbiter_init(void)
 {
@@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void)
 
 #ifndef CONFIG_ETRAX_KGDB
 	/* Global watch for writes to kernel text segment. */
-	crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+	crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
 			      arbiter_all_clients, arbiter_all_write, NULL);
 #endif
 }
diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c
index a01636a12a6e..d98131c45bb5 100644
--- a/arch/cris/kernel/traps.c
+++ b/arch/cris/kernel/traps.c
@@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *);
 void show_trace(unsigned long *stack)
 {
 	unsigned long addr, module_start, module_end;
-	extern char _stext, _etext;
+	extern char _stext[], _etext[];
 	int i;
 
 	pr_err("\nCall Trace: ");
@@ -69,8 +69,8 @@ void show_trace(unsigned long *stack)
 		 * down the cause of the crash will be able to figure
 		 * out the call path that was taken.
 		 */
-		if (((addr >= (unsigned long)&_stext) &&
-		     (addr <= (unsigned long)&_etext)) ||
+		if (((addr >= (unsigned long)_stext) &&
+		     (addr <= (unsigned long)_etext)) ||
 		    ((addr >= module_start) && (addr <= module_end))) {
 #ifdef CONFIG_KALLSYMS
 			print_ip_sym(addr);
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 2e1da71e27a4..ab346f5f8820 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,7 +7,8 @@
 #include <asm/errno.h>
 #include <linux/uaccess.h>
 
-extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
+extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr);
 
 static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index d155ca9e5098..37f7b2bf7f73 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o
 /*
  * do the futex operations
  */
-int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS; break;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 
-} /* end futex_atomic_op_inuser() */
+} /* end arch_futex_atomic_op_inuser() */
diff --git a/arch/h8300/include/asm/flat.h b/arch/h8300/include/asm/flat.h
index 18d024251738..7e0bd6fa1532 100644
--- a/arch/h8300/include/asm/flat.h
+++ b/arch/h8300/include/asm/flat.h
@@ -24,7 +24,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags,
 					u32 *addr, u32 *persistent)
 {
 	u32 val = get_unaligned((__force u32 *)rp);
-	if (!(flags & FLAT_FLAG_GOTPIC)
+	if (!(flags & FLAT_FLAG_GOTPIC))
 		val &= 0x00ffffff;
 	*addr = val;
 	return 0;
diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h
index 15e701130b27..1c5a30ec2df8 100644
--- a/arch/h8300/include/asm/traps.h
+++ b/arch/h8300/include/asm/traps.h
@@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table;
 #define TRAP2_VEC 10
 #define TRAP3_VEC 11
 
-extern char _start, _etext;
+extern char _start[], _etext[];
 #define check_kernel_text(addr) \
-	((addr >= (unsigned long)(&_start)) && \
-	 (addr <  (unsigned long)(&_etext)) && !(addr & 1))
+	((addr >= (unsigned long)(_start)) && \
+	 (addr <  (unsigned long)(_etext)) && !(addr & 1))
 
 #endif /* _H8300_TRAPS_H */
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index a62ba368b27d..fb3dfb2a667e 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new)
 	);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 /**
  * atomic_read - reads a word, atomically
  * @v: pointer to atomic value
diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h
index 7e597f8434da..c607b77c8215 100644
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -31,18 +31,9 @@
 
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h
index a1c55788c5d6..53a8d5885887 100644
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
  */
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 #define arch_spin_is_locked(x) ((x)->lock != 0)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index a3d0211970e9..c86a947f5368 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 	buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
-#define acpi_unlazy_tlb(x)
-
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)  \
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index 76acbcd5c060..6d67dc1eaf2b 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -45,18 +45,9 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index ca9e76149a4a..df2c121164b8 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 	ACCESS_ONCE(*p) = (tmp + 2) & ~1;
 }
 
-static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	int	*p = (int *)&lock->lock, ticket;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory");
-		if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-			return;
-		cpu_relax();
-	}
-
-	smp_acquire__after_ctrl_dep();
-}
-
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
 	long tmp = ACCESS_ONCE(lock->lock);
@@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 	arch_spin_lock(lock);
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	__ticket_spin_unlock_wait(lock);
-}
-
 #define arch_read_can_lock(rw)		(*(volatile int *)(rw) >= 0)
 #define arch_write_can_lock(rw)	(*(volatile int *)(rw) == 0)
 
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index fced197b9626..cbe5ac3699bf 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -168,7 +168,8 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
 
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->max = ARRAY_SIZE(tlb->local);
@@ -185,8 +186,11 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
  * collected.
  */
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+			unsigned long start, unsigned long end, bool force)
 {
+	if (force)
+		tlb->need_flush = 1;
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 121295637d0d..81416000c5e0 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
 	return 0;
 }
 
-u32
+int
 efi_mem_type (unsigned long phys_addr)
 {
 	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
 
 	if (md)
 		return md->type;
-	return 0;
+	return -EINVAL;
 }
 
 u64
diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h
index 455ce7ddbf14..dfcb0e4eb256 100644
--- a/arch/m32r/include/asm/flat.h
+++ b/arch/m32r/include/asm/flat.h
@@ -95,7 +95,7 @@ static inline unsigned long m32r_flat_get_addr_from_rp (u32 *rp,
 	return ~0;      /* bogus value */
 }
 
-static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
+static inline int flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
 {
         unsigned int reloc = flat_m32r_get_reloc_type (relval);
 	if (reloc & 0xf0) {
@@ -133,6 +133,7 @@ static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval)
 			break;
 		}
 	}
+	return 0;
 }
 
 // kludge - text_len is a local variable in the only user.
diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h
index 323c7fc953cd..a56825592b90 100644
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -30,11 +30,6 @@
 #define arch_spin_is_locked(x)		(*(volatile int *)(&(x)->slock) <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, VAL > 0);
-}
-
 /**
  * arch_spin_trylock - Try spin lock and return a result
  * @lock: Pointer to the lock variable
diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h
index 48b62790fe70..b2a41f5b3890 100644
--- a/arch/m68k/include/asm/flat.h
+++ b/arch/m68k/include/asm/flat.h
@@ -30,8 +30,7 @@ static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel)
 }
 #define	flat_get_relocate_addr(rel)		(rel)
 
-static inline int flat_set_persistent(unsigned long relval,
-				      unsigned long *persistent)
+static inline int flat_set_persistent(u32 relval, u32 *persistent)
 {
 	return 0;
 }
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 5b7a45d99cfb..7d8b322e5101 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -26,6 +26,7 @@ config METAG
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNDERSCORE_SYMBOL_PREFIX
 	select IRQ_DOMAIN
+	select GENERIC_IRQ_EFFECTIVE_AFF_MASK
 	select MODULES_USE_ELF_RELA
 	select OF
 	select OF_EARLY_FLATTREE
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 6c1380a8a0d4..eee779f26cc4 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i)
 	return i;
 }
 
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
 #define ATOMIC_OP(op, c_op)						\
 static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h
index c0c7a22be1ae..ddf7fe5708a6 100644
--- a/arch/metag/include/asm/spinlock.h
+++ b/arch/metag/include/asm/spinlock.h
@@ -15,11 +15,6 @@
  * locked.
  */
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 #define	arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h
index f23c3d266bae..3d2747d4c967 100644
--- a/arch/microblaze/include/asm/flat.h
+++ b/arch/microblaze/include/asm/flat.h
@@ -60,7 +60,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags,
  * unaligned.
  */
 
-static inline void
+static inline int
 flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 relval)
 {
 	u32 *p = (__force u32 *)rp;
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index 01848f056f43..a9dad9e5e132 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,18 +29,9 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8dd20358464f..48d91d5be4e9 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2260,7 +2260,7 @@ config CPU_R4K_CACHE_TLB
 
 config MIPS_MT_SMP
 	bool "MIPS MT SMP support (1 TC on each available VPE)"
-	depends on SYS_SUPPORTS_MULTITHREADING && !CPU_MIPSR6
+	depends on SYS_SUPPORTS_MULTITHREADING && !CPU_MIPSR6 && !CPU_MICROMIPS
 	select CPU_MIPSR2_IRQ_VI
 	select CPU_MIPSR2_IRQ_EI
 	select SYNC_R4K
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 04343625b929..bc2708c9ada4 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -243,8 +243,21 @@ include arch/mips/Kbuild.platforms
 ifdef CONFIG_PHYSICAL_START
 load-y					= $(CONFIG_PHYSICAL_START)
 endif
-entry-y				= 0x$(shell $(NM) vmlinux 2>/dev/null \
+
+entry-noisa-y				= 0x$(shell $(NM) vmlinux 2>/dev/null \
 					| grep "\bkernel_entry\b" | cut -f1 -d \ )
+ifdef CONFIG_CPU_MICROMIPS
+  #
+  # Set the ISA bit, since the kernel_entry symbol in the ELF will have it
+  # clear which would lead to images containing addresses which bootloaders may
+  # jump to as MIPS32 code.
+  #
+  entry-y = $(patsubst %0,%1,$(patsubst %2,%3,$(patsubst %4,%5, \
+              $(patsubst %6,%7,$(patsubst %8,%9,$(patsubst %a,%b, \
+              $(patsubst %c,%d,$(patsubst %e,%f,$(entry-noisa-y)))))))))
+else
+  entry-y = $(entry-noisa-y)
+endif
 
 cflags-y			+= -I$(srctree)/arch/mips/include/asm/mach-generic
 drivers-$(CONFIG_PCI)		+= arch/mips/pci/
diff --git a/arch/mips/boot/compressed/.gitignore b/arch/mips/boot/compressed/.gitignore
new file mode 100644
index 000000000000..ebae133f1d00
--- /dev/null
+++ b/arch/mips/boot/compressed/.gitignore
@@ -0,0 +1,2 @@
+ashldi3.c
+bswapsi.c
diff --git a/arch/mips/cavium-octeon/octeon-usb.c b/arch/mips/cavium-octeon/octeon-usb.c
index 542be1cd0f32..bfdfaf32d2c4 100644
--- a/arch/mips/cavium-octeon/octeon-usb.c
+++ b/arch/mips/cavium-octeon/octeon-usb.c
@@ -13,9 +13,9 @@
 #include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/of_platform.h>
+#include <linux/io.h>
 
 #include <asm/octeon/octeon.h>
-#include <asm/octeon/cvmx-gpio-defs.h>
 
 /* USB Control Register */
 union cvm_usbdrd_uctl_ctl {
diff --git a/arch/mips/dec/int-handler.S b/arch/mips/dec/int-handler.S
index 1910223a9c02..cea2bb1621e6 100644
--- a/arch/mips/dec/int-handler.S
+++ b/arch/mips/dec/int-handler.S
@@ -147,23 +147,12 @@
 		 * Find irq with highest priority
 		 */
 		# open coded PTR_LA t1, cpu_mask_nr_tbl
-#if (_MIPS_SZPTR == 32)
+#if defined(CONFIG_32BIT) || defined(KBUILD_64BIT_SYM32)
 		# open coded la t1, cpu_mask_nr_tbl
 		lui	t1, %hi(cpu_mask_nr_tbl)
 		addiu	t1, %lo(cpu_mask_nr_tbl)
-
-#endif
-#if (_MIPS_SZPTR == 64)
-		# open coded dla t1, cpu_mask_nr_tbl
-		.set	push
-		.set	noat
-		lui	t1, %highest(cpu_mask_nr_tbl)
-		lui	AT, %hi(cpu_mask_nr_tbl)
-		daddiu	t1, t1, %higher(cpu_mask_nr_tbl)
-		daddiu	AT, AT, %lo(cpu_mask_nr_tbl)
-		dsll	t1, 32
-		daddu	t1, t1, AT
-		.set	pop
+#else
+#error GCC `-msym32' option required for 64-bit DECstation builds
 #endif
 1:		lw	t2,(t1)
 		nop
@@ -214,23 +203,12 @@
 		 * Find irq with highest priority
 		 */
 		# open coded PTR_LA t1,asic_mask_nr_tbl
-#if (_MIPS_SZPTR == 32)
+#if defined(CONFIG_32BIT) || defined(KBUILD_64BIT_SYM32)
 		# open coded la t1, asic_mask_nr_tbl
 		lui	t1, %hi(asic_mask_nr_tbl)
 		addiu	t1, %lo(asic_mask_nr_tbl)
-
-#endif
-#if (_MIPS_SZPTR == 64)
-		# open coded dla t1, asic_mask_nr_tbl
-		.set	push
-		.set	noat
-		lui	t1, %highest(asic_mask_nr_tbl)
-		lui	AT, %hi(asic_mask_nr_tbl)
-		daddiu	t1, t1, %higher(asic_mask_nr_tbl)
-		daddiu	AT, AT, %lo(asic_mask_nr_tbl)
-		dsll	t1, 32
-		daddu	t1, t1, AT
-		.set	pop
+#else
+#error GCC `-msym32' option required for 64-bit DECstation builds
 #endif
 2:		lw	t2,(t1)
 		nop
diff --git a/arch/mips/include/asm/cache.h b/arch/mips/include/asm/cache.h
index fc67947ed658..8b14c2706aa5 100644
--- a/arch/mips/include/asm/cache.h
+++ b/arch/mips/include/asm/cache.h
@@ -9,6 +9,8 @@
 #ifndef _ASM_CACHE_H
 #define _ASM_CACHE_H
 
+#include <kmalloc.h>
+
 #define L1_CACHE_SHIFT		CONFIG_MIPS_L1_CACHE_SHIFT
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index 8baa9033b181..721b698bfe3c 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -428,6 +428,9 @@
 #ifndef cpu_scache_line_size
 #define cpu_scache_line_size()	cpu_data[0].scache.linesz
 #endif
+#ifndef cpu_tcache_line_size
+#define cpu_tcache_line_size()	cpu_data[0].tcache.linesz
+#endif
 
 #ifndef cpu_hwrena_impl_bits
 #define cpu_hwrena_impl_bits		0
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 1de190bdfb9c..a9e61ea54ca9 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -83,18 +83,9 @@
 }
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2998479fd4e8..a9af1d2dcd69 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -938,11 +938,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-							 unsigned long address)
-{
-}
-
 /* Emulation */
 int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/include/asm/mach-ralink/ralink_regs.h b/arch/mips/include/asm/mach-ralink/ralink_regs.h
index 9df1a53bcb36..b4e7dfa214eb 100644
--- a/arch/mips/include/asm/mach-ralink/ralink_regs.h
+++ b/arch/mips/include/asm/mach-ralink/ralink_regs.h
@@ -13,6 +13,8 @@
 #ifndef _RALINK_REGS_H_
 #define _RALINK_REGS_H_
 
+#include <linux/io.h>
+
 enum ralink_soc_type {
 	RALINK_UNKNOWN = 0,
 	RT2880_SOC,
diff --git a/arch/mips/include/asm/octeon/cvmx-l2c-defs.h b/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
index d045973ddb33..3ea84acf1814 100644
--- a/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
+++ b/arch/mips/include/asm/octeon/cvmx-l2c-defs.h
@@ -33,6 +33,10 @@
 #define CVMX_L2C_DBG (CVMX_ADD_IO_SEG(0x0001180080000030ull))
 #define CVMX_L2C_CFG (CVMX_ADD_IO_SEG(0x0001180080000000ull))
 #define CVMX_L2C_CTL (CVMX_ADD_IO_SEG(0x0001180080800000ull))
+#define CVMX_L2C_ERR_TDTX(block_id)					       \
+	(CVMX_ADD_IO_SEG(0x0001180080A007E0ull) + ((block_id) & 3) * 0x40000ull)
+#define CVMX_L2C_ERR_TTGX(block_id)					       \
+	(CVMX_ADD_IO_SEG(0x0001180080A007E8ull) + ((block_id) & 3) * 0x40000ull)
 #define CVMX_L2C_LCKBASE (CVMX_ADD_IO_SEG(0x0001180080000058ull))
 #define CVMX_L2C_LCKOFF (CVMX_ADD_IO_SEG(0x0001180080000060ull))
 #define CVMX_L2C_PFCTL (CVMX_ADD_IO_SEG(0x0001180080000090ull))
@@ -66,9 +70,40 @@
 		((offset) & 1) * 8)
 #define CVMX_L2C_WPAR_PPX(offset) (CVMX_ADD_IO_SEG(0x0001180080840000ull)    + \
 		((offset) & 31) * 8)
-#define CVMX_L2D_FUS3 (CVMX_ADD_IO_SEG(0x00011800800007B8ull))
 
 
+union cvmx_l2c_err_tdtx {
+	uint64_t u64;
+	struct cvmx_l2c_err_tdtx_s {
+		__BITFIELD_FIELD(uint64_t dbe:1,
+		__BITFIELD_FIELD(uint64_t sbe:1,
+		__BITFIELD_FIELD(uint64_t vdbe:1,
+		__BITFIELD_FIELD(uint64_t vsbe:1,
+		__BITFIELD_FIELD(uint64_t syn:10,
+		__BITFIELD_FIELD(uint64_t reserved_22_49:28,
+		__BITFIELD_FIELD(uint64_t wayidx:18,
+		__BITFIELD_FIELD(uint64_t reserved_2_3:2,
+		__BITFIELD_FIELD(uint64_t type:2,
+		;)))))))))
+	} s;
+};
+
+union cvmx_l2c_err_ttgx {
+	uint64_t u64;
+	struct cvmx_l2c_err_ttgx_s {
+		__BITFIELD_FIELD(uint64_t dbe:1,
+		__BITFIELD_FIELD(uint64_t sbe:1,
+		__BITFIELD_FIELD(uint64_t noway:1,
+		__BITFIELD_FIELD(uint64_t reserved_56_60:5,
+		__BITFIELD_FIELD(uint64_t syn:6,
+		__BITFIELD_FIELD(uint64_t reserved_22_49:28,
+		__BITFIELD_FIELD(uint64_t wayidx:15,
+		__BITFIELD_FIELD(uint64_t reserved_2_6:5,
+		__BITFIELD_FIELD(uint64_t type:2,
+		;)))))))))
+	} s;
+};
+
 union cvmx_l2c_cfg {
 	uint64_t u64;
 	struct cvmx_l2c_cfg_s {
diff --git a/arch/mips/include/asm/octeon/cvmx-l2d-defs.h b/arch/mips/include/asm/octeon/cvmx-l2d-defs.h
new file mode 100644
index 000000000000..a951ad5d65ad
--- /dev/null
+++ b/arch/mips/include/asm/octeon/cvmx-l2d-defs.h
@@ -0,0 +1,60 @@
+/***********************license start***************
+ * Author: Cavium Networks
+ *
+ * Contact: support@caviumnetworks.com
+ * This file is part of the OCTEON SDK
+ *
+ * Copyright (c) 2003-2017 Cavium, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, Version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
+ * NONINFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this file; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * or visit http://www.gnu.org/licenses/.
+ *
+ * This file may also be available under a different license from Cavium.
+ * Contact Cavium Networks for more information
+ ***********************license end**************************************/
+
+#ifndef __CVMX_L2D_DEFS_H__
+#define __CVMX_L2D_DEFS_H__
+
+#define CVMX_L2D_ERR	(CVMX_ADD_IO_SEG(0x0001180080000010ull))
+#define CVMX_L2D_FUS3	(CVMX_ADD_IO_SEG(0x00011800800007B8ull))
+
+
+union cvmx_l2d_err {
+	uint64_t u64;
+	struct cvmx_l2d_err_s {
+		__BITFIELD_FIELD(uint64_t reserved_6_63:58,
+		__BITFIELD_FIELD(uint64_t bmhclsel:1,
+		__BITFIELD_FIELD(uint64_t ded_err:1,
+		__BITFIELD_FIELD(uint64_t sec_err:1,
+		__BITFIELD_FIELD(uint64_t ded_intena:1,
+		__BITFIELD_FIELD(uint64_t sec_intena:1,
+		__BITFIELD_FIELD(uint64_t ecc_ena:1,
+		;)))))))
+	} s;
+};
+
+union cvmx_l2d_fus3 {
+	uint64_t u64;
+	struct cvmx_l2d_fus3_s {
+		__BITFIELD_FIELD(uint64_t reserved_40_63:24,
+		__BITFIELD_FIELD(uint64_t ema_ctl:3,
+		__BITFIELD_FIELD(uint64_t reserved_34_36:3,
+		__BITFIELD_FIELD(uint64_t q3fus:34,
+		;))))
+	} s;
+};
+
+#endif
diff --git a/arch/mips/include/asm/octeon/cvmx.h b/arch/mips/include/asm/octeon/cvmx.h
index 9742202f2a32..e638735cc3ac 100644
--- a/arch/mips/include/asm/octeon/cvmx.h
+++ b/arch/mips/include/asm/octeon/cvmx.h
@@ -62,6 +62,7 @@ enum cvmx_mips_space {
 #include <asm/octeon/cvmx-iob-defs.h>
 #include <asm/octeon/cvmx-ipd-defs.h>
 #include <asm/octeon/cvmx-l2c-defs.h>
+#include <asm/octeon/cvmx-l2d-defs.h>
 #include <asm/octeon/cvmx-l2t-defs.h>
 #include <asm/octeon/cvmx-led-defs.h>
 #include <asm/octeon/cvmx-mio-defs.h>
diff --git a/arch/mips/include/uapi/asm/ioctls.h b/arch/mips/include/uapi/asm/ioctls.h
index 68e19b689a00..1609cb0907ac 100644
--- a/arch/mips/include/uapi/asm/ioctls.h
+++ b/arch/mips/include/uapi/asm/ioctls.h
@@ -91,7 +91,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 /* I hope the range from 0x5480 on is free ... */
 #define TIOCSCTTY	0x5480		/* become controlling tty */
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 6dd13641a418..1395654cfc8d 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -872,15 +872,13 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 	if (unlikely(test_thread_flag(TIF_SECCOMP))) {
 		int ret, i;
 		struct seccomp_data sd;
+		unsigned long args[6];
 
 		sd.nr = syscall;
 		sd.arch = syscall_get_arch();
-		for (i = 0; i < 6; i++) {
-			unsigned long v, r;
-
-			r = mips_get_syscall_arg(&v, current, regs, i);
-			sd.args[i] = r ? 0 : v;
-		}
+		syscall_get_arguments(current, regs, 0, 6, args);
+		for (i = 0; i < 6; i++)
+			sd.args[i] = args[i];
 		sd.instruction_pointer = KSTK_EIP(current);
 
 		ret = __secure_computing(&sd);
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 27c2f90eeb21..a9a7d78803cd 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -190,12 +190,6 @@ illegal_syscall:
 	sll	t1, t0, 2
 	beqz	v0, einval
 	lw	t2, sys_call_table(t1)		# syscall routine
-	sw	a0, PT_R2(sp)			# call routine directly on restart
-
-	/* Some syscalls like execve get their arguments from struct pt_regs
-	   and claim zero arguments in the syscall table. Thus we have to
-	   assume the worst case and shuffle around all potential arguments.
-	   If you want performance, don't use indirect syscalls. */
 
 	move	a0, a1				# shift argument registers
 	move	a1, a2
@@ -207,11 +201,6 @@ illegal_syscall:
 	sw	t4, 16(sp)
 	sw	t5, 20(sp)
 	sw	t6, 24(sp)
-	sw	a0, PT_R4(sp)			# .. and push back a0 - a3, some
-	sw	a1, PT_R5(sp)			# syscalls expect them there
-	sw	a2, PT_R6(sp)
-	sw	a3, PT_R7(sp)
-	sw	a3, PT_R26(sp)			# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index c30bc520885f..9ebe3e2403b1 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -198,7 +198,6 @@ LEAF(sys32_syscall)
 	dsll	t1, t0, 3
 	beqz	v0, einval
 	ld	t2, sys32_call_table(t1)		# syscall routine
-	sd	a0, PT_R2(sp)		# call routine directly on restart
 
 	move	a0, a1			# shift argument registers
 	move	a1, a2
@@ -207,11 +206,6 @@ LEAF(sys32_syscall)
 	move	a4, a5
 	move	a5, a6
 	move	a6, a7
-	sd	a0, PT_R4(sp)		# ... and push back a0 - a3, some
-	sd	a1, PT_R5(sp)		# syscalls expect them there
-	sd	a2, PT_R6(sp)
-	sd	a3, PT_R7(sp)
-	sd	a3, PT_R26(sp)		# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 770d4d1516cb..c7cbddfcdc3b 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -376,9 +376,6 @@ asmlinkage void start_secondary(void)
 	cpumask_set_cpu(cpu, &cpu_coherent_mask);
 	notify_cpu_starting(cpu);
 
-	complete(&cpu_running);
-	synchronise_count_slave(cpu);
-
 	set_cpu_online(cpu, true);
 
 	set_cpu_sibling_map(cpu);
@@ -386,6 +383,9 @@ asmlinkage void start_secondary(void)
 
 	calculate_cpu_foreign_map();
 
+	complete(&cpu_running);
+	synchronise_count_slave(cpu);
+
 	/*
 	 * irq will be enabled in ->smp_finish(), enabling it too early
 	 * is dangerous.
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 
 static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
 
 void tick_broadcast(const struct cpumask *mask)
 {
 	atomic_t *count;
-	struct call_single_data *csd;
+	call_single_data_t *csd;
 	int cpu;
 
 	for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
 
 static int __init tick_broadcast_init(void)
 {
-	struct call_single_data *csd;
+	call_single_data_t *csd;
 	int cpu;
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index e08598c70b3e..8e78251eccc2 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -232,7 +232,7 @@ static int mips_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	else
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
 	if (off < count && user_count <= (count - off)) {
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 3f74f6c1f065..9fea6c6bbf49 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -48,7 +48,7 @@
 
 #include "uasm.c"
 
-static const struct insn const insn_table[insn_invalid] = {
+static const struct insn insn_table[insn_invalid] = {
 	[insn_addiu]	= {M(addiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM},
 	[insn_addu]	= {M(spec_op, 0, 0, 0, 0, addu_op), RS | RT | RD},
 	[insn_and]	= {M(spec_op, 0, 0, 0, 0, and_op), RS | RT | RD},
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
new file mode 100644
index 000000000000..3f87b96da5c4
--- /dev/null
+++ b/arch/mips/net/ebpf_jit.c
@@ -0,0 +1,1950 @@
+/*
+ * Just-In-Time compiler for eBPF filters on MIPS
+ *
+ * Copyright (c) 2017 Cavium, Inc.
+ *
+ * Based on code from:
+ *
+ * Copyright (c) 2014 Imagination Technologies Ltd.
+ * Author: Markos Chandras <markos.chandras@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/slab.h>
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+#include <asm/cacheflush.h>
+#include <asm/cpu-features.h>
+#include <asm/uasm.h>
+
+/* Registers used by JIT */
+#define MIPS_R_ZERO	0
+#define MIPS_R_AT	1
+#define MIPS_R_V0	2	/* BPF_R0 */
+#define MIPS_R_V1	3
+#define MIPS_R_A0	4	/* BPF_R1 */
+#define MIPS_R_A1	5	/* BPF_R2 */
+#define MIPS_R_A2	6	/* BPF_R3 */
+#define MIPS_R_A3	7	/* BPF_R4 */
+#define MIPS_R_A4	8	/* BPF_R5 */
+#define MIPS_R_T4	12	/* BPF_AX */
+#define MIPS_R_T5	13
+#define MIPS_R_T6	14
+#define MIPS_R_T7	15
+#define MIPS_R_S0	16	/* BPF_R6 */
+#define MIPS_R_S1	17	/* BPF_R7 */
+#define MIPS_R_S2	18	/* BPF_R8 */
+#define MIPS_R_S3	19	/* BPF_R9 */
+#define MIPS_R_S4	20	/* BPF_TCC */
+#define MIPS_R_S5	21
+#define MIPS_R_S6	22
+#define MIPS_R_S7	23
+#define MIPS_R_T8	24
+#define MIPS_R_T9	25
+#define MIPS_R_SP	29
+#define MIPS_R_RA	31
+
+/* eBPF flags */
+#define EBPF_SAVE_S0	BIT(0)
+#define EBPF_SAVE_S1	BIT(1)
+#define EBPF_SAVE_S2	BIT(2)
+#define EBPF_SAVE_S3	BIT(3)
+#define EBPF_SAVE_S4	BIT(4)
+#define EBPF_SAVE_RA	BIT(5)
+#define EBPF_SEEN_FP	BIT(6)
+#define EBPF_SEEN_TC	BIT(7)
+#define EBPF_TCC_IN_V1	BIT(8)
+
+/*
+ * For the mips64 ISA, we need to track the value range or type for
+ * each JIT register.  The BPF machine requires zero extended 32-bit
+ * values, but the mips64 ISA requires sign extended 32-bit values.
+ * At each point in the BPF program we track the state of every
+ * register so that we can zero extend or sign extend as the BPF
+ * semantics require.
+ */
+enum reg_val_type {
+	/* uninitialized */
+	REG_UNKNOWN,
+	/* not known to be 32-bit compatible. */
+	REG_64BIT,
+	/* 32-bit compatible, no truncation needed for 64-bit ops. */
+	REG_64BIT_32BIT,
+	/* 32-bit compatible, need truncation for 64-bit ops. */
+	REG_32BIT,
+	/* 32-bit zero extended. */
+	REG_32BIT_ZERO_EX,
+	/* 32-bit no sign/zero extension needed. */
+	REG_32BIT_POS
+};
+
+/*
+ * high bit of offsets indicates if long branch conversion done at
+ * this insn.
+ */
+#define OFFSETS_B_CONV	BIT(31)
+
+/**
+ * struct jit_ctx - JIT context
+ * @skf:		The sk_filter
+ * @stack_size:		eBPF stack size
+ * @tmp_offset:		eBPF $sp offset to 8-byte temporary memory
+ * @idx:		Instruction index
+ * @flags:		JIT flags
+ * @offsets:		Instruction offsets
+ * @target:		Memory location for the compiled filter
+ * @reg_val_types	Packed enum reg_val_type for each register.
+ */
+struct jit_ctx {
+	const struct bpf_prog *skf;
+	int stack_size;
+	int tmp_offset;
+	u32 idx;
+	u32 flags;
+	u32 *offsets;
+	u32 *target;
+	u64 *reg_val_types;
+	unsigned int long_b_conversion:1;
+	unsigned int gen_b_offsets:1;
+};
+
+static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type)
+{
+	*rvt &= ~(7ull << (reg * 3));
+	*rvt |= ((u64)type << (reg * 3));
+}
+
+static enum reg_val_type get_reg_val_type(const struct jit_ctx *ctx,
+					  int index, int reg)
+{
+	return (ctx->reg_val_types[index] >> (reg * 3)) & 7;
+}
+
+/* Simply emit the instruction if the JIT memory space has been allocated */
+#define emit_instr(ctx, func, ...)			\
+do {							\
+	if ((ctx)->target != NULL) {			\
+		u32 *p = &(ctx)->target[ctx->idx];	\
+		uasm_i_##func(&p, ##__VA_ARGS__);	\
+	}						\
+	(ctx)->idx++;					\
+} while (0)
+
+static unsigned int j_target(struct jit_ctx *ctx, int target_idx)
+{
+	unsigned long target_va, base_va;
+	unsigned int r;
+
+	if (!ctx->target)
+		return 0;
+
+	base_va = (unsigned long)ctx->target;
+	target_va = base_va + (ctx->offsets[target_idx] & ~OFFSETS_B_CONV);
+
+	if ((base_va & ~0x0ffffffful) != (target_va & ~0x0ffffffful))
+		return (unsigned int)-1;
+	r = target_va & 0x0ffffffful;
+	return r;
+}
+
+/* Compute the immediate value for PC-relative branches. */
+static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx)
+{
+	if (!ctx->gen_b_offsets)
+		return 0;
+
+	/*
+	 * We want a pc-relative branch.  tgt is the instruction offset
+	 * we want to jump to.
+
+	 * Branch on MIPS:
+	 * I: target_offset <- sign_extend(offset)
+	 * I+1: PC += target_offset (delay slot)
+	 *
+	 * ctx->idx currently points to the branch instruction
+	 * but the offset is added to the delay slot so we need
+	 * to subtract 4.
+	 */
+	return (ctx->offsets[tgt] & ~OFFSETS_B_CONV) -
+		(ctx->idx * 4) - 4;
+}
+
+int bpf_jit_enable __read_mostly;
+
+enum which_ebpf_reg {
+	src_reg,
+	src_reg_no_fp,
+	dst_reg,
+	dst_reg_fp_ok
+};
+
+/*
+ * For eBPF, the register mapping naturally falls out of the
+ * requirements of eBPF and the MIPS n64 ABI.  We don't maintain a
+ * separate frame pointer, so BPF_REG_10 relative accesses are
+ * adjusted to be $sp relative.
+ */
+int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn,
+		     enum which_ebpf_reg w)
+{
+	int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ?
+		insn->src_reg : insn->dst_reg;
+
+	switch (ebpf_reg) {
+	case BPF_REG_0:
+		return MIPS_R_V0;
+	case BPF_REG_1:
+		return MIPS_R_A0;
+	case BPF_REG_2:
+		return MIPS_R_A1;
+	case BPF_REG_3:
+		return MIPS_R_A2;
+	case BPF_REG_4:
+		return MIPS_R_A3;
+	case BPF_REG_5:
+		return MIPS_R_A4;
+	case BPF_REG_6:
+		ctx->flags |= EBPF_SAVE_S0;
+		return MIPS_R_S0;
+	case BPF_REG_7:
+		ctx->flags |= EBPF_SAVE_S1;
+		return MIPS_R_S1;
+	case BPF_REG_8:
+		ctx->flags |= EBPF_SAVE_S2;
+		return MIPS_R_S2;
+	case BPF_REG_9:
+		ctx->flags |= EBPF_SAVE_S3;
+		return MIPS_R_S3;
+	case BPF_REG_10:
+		if (w == dst_reg || w == src_reg_no_fp)
+			goto bad_reg;
+		ctx->flags |= EBPF_SEEN_FP;
+		/*
+		 * Needs special handling, return something that
+		 * cannot be clobbered just in case.
+		 */
+		return MIPS_R_ZERO;
+	case BPF_REG_AX:
+		return MIPS_R_T4;
+	default:
+bad_reg:
+		WARN(1, "Illegal bpf reg: %d\n", ebpf_reg);
+		return -EINVAL;
+	}
+}
+/*
+ * eBPF stack frame will be something like:
+ *
+ *  Entry $sp ------>   +--------------------------------+
+ *                      |   $ra  (optional)              |
+ *                      +--------------------------------+
+ *                      |   $s0  (optional)              |
+ *                      +--------------------------------+
+ *                      |   $s1  (optional)              |
+ *                      +--------------------------------+
+ *                      |   $s2  (optional)              |
+ *                      +--------------------------------+
+ *                      |   $s3  (optional)              |
+ *                      +--------------------------------+
+ *                      |   $s4  (optional)              |
+ *                      +--------------------------------+
+ *                      |   tmp-storage  (if $ra saved)  |
+ * $sp + tmp_offset --> +--------------------------------+ <--BPF_REG_10
+ *                      |   BPF_REG_10 relative storage  |
+ *                      |    MAX_BPF_STACK (optional)    |
+ *                      |      .                         |
+ *                      |      .                         |
+ *                      |      .                         |
+ *     $sp -------->    +--------------------------------+
+ *
+ * If BPF_REG_10 is never referenced, then the MAX_BPF_STACK sized
+ * area is not allocated.
+ */
+static int gen_int_prologue(struct jit_ctx *ctx)
+{
+	int stack_adjust = 0;
+	int store_offset;
+	int locals_size;
+
+	if (ctx->flags & EBPF_SAVE_RA)
+		/*
+		 * If RA we are doing a function call and may need
+		 * extra 8-byte tmp area.
+		 */
+		stack_adjust += 16;
+	if (ctx->flags & EBPF_SAVE_S0)
+		stack_adjust += 8;
+	if (ctx->flags & EBPF_SAVE_S1)
+		stack_adjust += 8;
+	if (ctx->flags & EBPF_SAVE_S2)
+		stack_adjust += 8;
+	if (ctx->flags & EBPF_SAVE_S3)
+		stack_adjust += 8;
+	if (ctx->flags & EBPF_SAVE_S4)
+		stack_adjust += 8;
+
+	BUILD_BUG_ON(MAX_BPF_STACK & 7);
+	locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0;
+
+	stack_adjust += locals_size;
+	ctx->tmp_offset = locals_size;
+
+	ctx->stack_size = stack_adjust;
+
+	/*
+	 * First instruction initializes the tail call count (TCC).
+	 * On tail call we skip this instruction, and the TCC is
+	 * passed in $v1 from the caller.
+	 */
+	emit_instr(ctx, daddiu, MIPS_R_V1, MIPS_R_ZERO, MAX_TAIL_CALL_CNT);
+	if (stack_adjust)
+		emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack_adjust);
+	else
+		return 0;
+
+	store_offset = stack_adjust - 8;
+
+	if (ctx->flags & EBPF_SAVE_RA) {
+		emit_instr(ctx, sd, MIPS_R_RA, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S0) {
+		emit_instr(ctx, sd, MIPS_R_S0, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S1) {
+		emit_instr(ctx, sd, MIPS_R_S1, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S2) {
+		emit_instr(ctx, sd, MIPS_R_S2, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S3) {
+		emit_instr(ctx, sd, MIPS_R_S3, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S4) {
+		emit_instr(ctx, sd, MIPS_R_S4, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+
+	if ((ctx->flags & EBPF_SEEN_TC) && !(ctx->flags & EBPF_TCC_IN_V1))
+		emit_instr(ctx, daddu, MIPS_R_S4, MIPS_R_V1, MIPS_R_ZERO);
+
+	return 0;
+}
+
+static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg)
+{
+	const struct bpf_prog *prog = ctx->skf;
+	int stack_adjust = ctx->stack_size;
+	int store_offset = stack_adjust - 8;
+	int r0 = MIPS_R_V0;
+
+	if (dest_reg == MIPS_R_RA &&
+	    get_reg_val_type(ctx, prog->len, BPF_REG_0) == REG_32BIT_ZERO_EX)
+		/* Don't let zero extended value escape. */
+		emit_instr(ctx, sll, r0, r0, 0);
+
+	if (ctx->flags & EBPF_SAVE_RA) {
+		emit_instr(ctx, ld, MIPS_R_RA, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S0) {
+		emit_instr(ctx, ld, MIPS_R_S0, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S1) {
+		emit_instr(ctx, ld, MIPS_R_S1, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S2) {
+		emit_instr(ctx, ld, MIPS_R_S2, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S3) {
+		emit_instr(ctx, ld, MIPS_R_S3, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	if (ctx->flags & EBPF_SAVE_S4) {
+		emit_instr(ctx, ld, MIPS_R_S4, store_offset, MIPS_R_SP);
+		store_offset -= 8;
+	}
+	emit_instr(ctx, jr, dest_reg);
+
+	if (stack_adjust)
+		emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, stack_adjust);
+	else
+		emit_instr(ctx, nop);
+
+	return 0;
+}
+
+static void gen_imm_to_reg(const struct bpf_insn *insn, int reg,
+			   struct jit_ctx *ctx)
+{
+	if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
+		emit_instr(ctx, addiu, reg, MIPS_R_ZERO, insn->imm);
+	} else {
+		int lower = (s16)(insn->imm & 0xffff);
+		int upper = insn->imm - lower;
+
+		emit_instr(ctx, lui, reg, upper >> 16);
+		emit_instr(ctx, addiu, reg, reg, lower);
+	}
+
+}
+
+static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+			int idx)
+{
+	int upper_bound, lower_bound;
+	int dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+
+	if (dst < 0)
+		return dst;
+
+	switch (BPF_OP(insn->code)) {
+	case BPF_MOV:
+	case BPF_ADD:
+		upper_bound = S16_MAX;
+		lower_bound = S16_MIN;
+		break;
+	case BPF_SUB:
+		upper_bound = -(int)S16_MIN;
+		lower_bound = -(int)S16_MAX;
+		break;
+	case BPF_AND:
+	case BPF_OR:
+	case BPF_XOR:
+		upper_bound = 0xffff;
+		lower_bound = 0;
+		break;
+	case BPF_RSH:
+	case BPF_LSH:
+	case BPF_ARSH:
+		/* Shift amounts are truncated, no need for bounds */
+		upper_bound = S32_MAX;
+		lower_bound = S32_MIN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * Immediate move clobbers the register, so no sign/zero
+	 * extension needed.
+	 */
+	if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+	    BPF_OP(insn->code) != BPF_MOV &&
+	    get_reg_val_type(ctx, idx, insn->dst_reg) == REG_32BIT)
+		emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+	/* BPF_ALU | BPF_LSH doesn't need separate sign extension */
+	if (BPF_CLASS(insn->code) == BPF_ALU &&
+	    BPF_OP(insn->code) != BPF_LSH &&
+	    BPF_OP(insn->code) != BPF_MOV &&
+	    get_reg_val_type(ctx, idx, insn->dst_reg) != REG_32BIT)
+		emit_instr(ctx, sll, dst, dst, 0);
+
+	if (insn->imm >= lower_bound && insn->imm <= upper_bound) {
+		/* single insn immediate case */
+		switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) {
+		case BPF_ALU64 | BPF_MOV:
+			emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, insn->imm);
+			break;
+		case BPF_ALU64 | BPF_AND:
+		case BPF_ALU | BPF_AND:
+			emit_instr(ctx, andi, dst, dst, insn->imm);
+			break;
+		case BPF_ALU64 | BPF_OR:
+		case BPF_ALU | BPF_OR:
+			emit_instr(ctx, ori, dst, dst, insn->imm);
+			break;
+		case BPF_ALU64 | BPF_XOR:
+		case BPF_ALU | BPF_XOR:
+			emit_instr(ctx, xori, dst, dst, insn->imm);
+			break;
+		case BPF_ALU64 | BPF_ADD:
+			emit_instr(ctx, daddiu, dst, dst, insn->imm);
+			break;
+		case BPF_ALU64 | BPF_SUB:
+			emit_instr(ctx, daddiu, dst, dst, -insn->imm);
+			break;
+		case BPF_ALU64 | BPF_RSH:
+			emit_instr(ctx, dsrl_safe, dst, dst, insn->imm & 0x3f);
+			break;
+		case BPF_ALU | BPF_RSH:
+			emit_instr(ctx, srl, dst, dst, insn->imm & 0x1f);
+			break;
+		case BPF_ALU64 | BPF_LSH:
+			emit_instr(ctx, dsll_safe, dst, dst, insn->imm & 0x3f);
+			break;
+		case BPF_ALU | BPF_LSH:
+			emit_instr(ctx, sll, dst, dst, insn->imm & 0x1f);
+			break;
+		case BPF_ALU64 | BPF_ARSH:
+			emit_instr(ctx, dsra_safe, dst, dst, insn->imm & 0x3f);
+			break;
+		case BPF_ALU | BPF_ARSH:
+			emit_instr(ctx, sra, dst, dst, insn->imm & 0x1f);
+			break;
+		case BPF_ALU | BPF_MOV:
+			emit_instr(ctx, addiu, dst, MIPS_R_ZERO, insn->imm);
+			break;
+		case BPF_ALU | BPF_ADD:
+			emit_instr(ctx, addiu, dst, dst, insn->imm);
+			break;
+		case BPF_ALU | BPF_SUB:
+			emit_instr(ctx, addiu, dst, dst, -insn->imm);
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		/* multi insn immediate case */
+		if (BPF_OP(insn->code) == BPF_MOV) {
+			gen_imm_to_reg(insn, dst, ctx);
+		} else {
+			gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+			switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) {
+			case BPF_ALU64 | BPF_AND:
+			case BPF_ALU | BPF_AND:
+				emit_instr(ctx, and, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU64 | BPF_OR:
+			case BPF_ALU | BPF_OR:
+				emit_instr(ctx, or, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU64 | BPF_XOR:
+			case BPF_ALU | BPF_XOR:
+				emit_instr(ctx, xor, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU64 | BPF_ADD:
+				emit_instr(ctx, daddu, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU64 | BPF_SUB:
+				emit_instr(ctx, dsubu, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU | BPF_ADD:
+				emit_instr(ctx, addu, dst, dst, MIPS_R_AT);
+				break;
+			case BPF_ALU | BPF_SUB:
+				emit_instr(ctx, subu, dst, dst, MIPS_R_AT);
+				break;
+			default:
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void * __must_check
+ool_skb_header_pointer(const struct sk_buff *skb, int offset,
+		       int len, void *buffer)
+{
+	return skb_header_pointer(skb, offset, len, buffer);
+}
+
+static int size_to_len(const struct bpf_insn *insn)
+{
+	switch (BPF_SIZE(insn->code)) {
+	case BPF_B:
+		return 1;
+	case BPF_H:
+		return 2;
+	case BPF_W:
+		return 4;
+	case BPF_DW:
+		return 8;
+	}
+	return 0;
+}
+
+static void emit_const_to_reg(struct jit_ctx *ctx, int dst, u64 value)
+{
+	if (value >= 0xffffffffffff8000ull || value < 0x8000ull) {
+		emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, (int)value);
+	} else if (value >= 0xffffffff80000000ull ||
+		   (value < 0x80000000 && value > 0xffff)) {
+		emit_instr(ctx, lui, dst, (s32)(s16)(value >> 16));
+		emit_instr(ctx, ori, dst, dst, (unsigned int)(value & 0xffff));
+	} else {
+		int i;
+		bool seen_part = false;
+		int needed_shift = 0;
+
+		for (i = 0; i < 4; i++) {
+			u64 part = (value >> (16 * (3 - i))) & 0xffff;
+
+			if (seen_part && needed_shift > 0 && (part || i == 3)) {
+				emit_instr(ctx, dsll_safe, dst, dst, needed_shift);
+				needed_shift = 0;
+			}
+			if (part) {
+				if (i == 0 || (!seen_part && i < 3 && part < 0x8000)) {
+					emit_instr(ctx, lui, dst, (s32)(s16)part);
+					needed_shift = -16;
+				} else {
+					emit_instr(ctx, ori, dst,
+						   seen_part ? dst : MIPS_R_ZERO,
+						   (unsigned int)part);
+				}
+				seen_part = true;
+			}
+			if (seen_part)
+				needed_shift += 16;
+		}
+	}
+}
+
+static int emit_bpf_tail_call(struct jit_ctx *ctx, int this_idx)
+{
+	int off, b_off;
+
+	ctx->flags |= EBPF_SEEN_TC;
+	/*
+	 * if (index >= array->map.max_entries)
+	 *     goto out;
+	 */
+	off = offsetof(struct bpf_array, map.max_entries);
+	emit_instr(ctx, lwu, MIPS_R_T5, off, MIPS_R_A1);
+	emit_instr(ctx, sltu, MIPS_R_AT, MIPS_R_T5, MIPS_R_A2);
+	b_off = b_imm(this_idx + 1, ctx);
+	emit_instr(ctx, bne, MIPS_R_AT, MIPS_R_ZERO, b_off);
+	/*
+	 * if (--TCC < 0)
+	 *     goto out;
+	 */
+	/* Delay slot */
+	emit_instr(ctx, daddiu, MIPS_R_T5,
+		   (ctx->flags & EBPF_TCC_IN_V1) ? MIPS_R_V1 : MIPS_R_S4, -1);
+	b_off = b_imm(this_idx + 1, ctx);
+	emit_instr(ctx, bltz, MIPS_R_T5, b_off);
+	/*
+	 * prog = array->ptrs[index];
+	 * if (prog == NULL)
+	 *     goto out;
+	 */
+	/* Delay slot */
+	emit_instr(ctx, dsll, MIPS_R_T8, MIPS_R_A2, 3);
+	emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, MIPS_R_A1);
+	off = offsetof(struct bpf_array, ptrs);
+	emit_instr(ctx, ld, MIPS_R_AT, off, MIPS_R_T8);
+	b_off = b_imm(this_idx + 1, ctx);
+	emit_instr(ctx, beq, MIPS_R_AT, MIPS_R_ZERO, b_off);
+	/* Delay slot */
+	emit_instr(ctx, nop);
+
+	/* goto *(prog->bpf_func + 4); */
+	off = offsetof(struct bpf_prog, bpf_func);
+	emit_instr(ctx, ld, MIPS_R_T9, off, MIPS_R_AT);
+	/* All systems are go... propagate TCC */
+	emit_instr(ctx, daddu, MIPS_R_V1, MIPS_R_T5, MIPS_R_ZERO);
+	/* Skip first instruction (TCC initialization) */
+	emit_instr(ctx, daddiu, MIPS_R_T9, MIPS_R_T9, 4);
+	return build_int_epilogue(ctx, MIPS_R_T9);
+}
+
+static bool use_bbit_insns(void)
+{
+	switch (current_cpu_type()) {
+	case CPU_CAVIUM_OCTEON:
+	case CPU_CAVIUM_OCTEON_PLUS:
+	case CPU_CAVIUM_OCTEON2:
+	case CPU_CAVIUM_OCTEON3:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool is_bad_offset(int b_off)
+{
+	return b_off > 0x1ffff || b_off < -0x20000;
+}
+
+/* Returns the number of insn slots consumed. */
+static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
+			  int this_idx, int exit_idx)
+{
+	int src, dst, r, td, ts, mem_off, b_off;
+	bool need_swap, did_move, cmp_eq;
+	unsigned int target;
+	u64 t64;
+	s64 t64s;
+
+	switch (insn->code) {
+	case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_SUB | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_OR | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_AND | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_LSH | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_RSH | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_XOR | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_ARSH | BPF_K: /* ALU64_IMM */
+	case BPF_ALU64 | BPF_MOV | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_MOV | BPF_K: /* ALU32_IMM */
+	case BPF_ALU | BPF_ADD | BPF_K: /* ALU32_IMM */
+	case BPF_ALU | BPF_SUB | BPF_K: /* ALU32_IMM */
+	case BPF_ALU | BPF_OR | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_AND | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_LSH | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_RSH | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_XOR | BPF_K: /* ALU64_IMM */
+	case BPF_ALU | BPF_ARSH | BPF_K: /* ALU64_IMM */
+		r = gen_imm_insn(insn, ctx, this_idx);
+		if (r < 0)
+			return r;
+		break;
+	case BPF_ALU64 | BPF_MUL | BPF_K: /* ALU64_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+		if (insn->imm == 1) /* Mult by 1 is a nop */
+			break;
+		gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+		emit_instr(ctx, dmultu, MIPS_R_AT, dst);
+		emit_instr(ctx, mflo, dst);
+		break;
+	case BPF_ALU64 | BPF_NEG | BPF_K: /* ALU64_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+		emit_instr(ctx, dsubu, dst, MIPS_R_ZERO, dst);
+		break;
+	case BPF_ALU | BPF_MUL | BPF_K: /* ALU_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+			/* sign extend */
+			emit_instr(ctx, sll, dst, dst, 0);
+		}
+		if (insn->imm == 1) /* Mult by 1 is a nop */
+			break;
+		gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+		emit_instr(ctx, multu, dst, MIPS_R_AT);
+		emit_instr(ctx, mflo, dst);
+		break;
+	case BPF_ALU | BPF_NEG | BPF_K: /* ALU_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+			/* sign extend */
+			emit_instr(ctx, sll, dst, dst, 0);
+		}
+		emit_instr(ctx, subu, dst, MIPS_R_ZERO, dst);
+		break;
+	case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */
+	case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		if (insn->imm == 0) { /* Div by zero */
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
+		}
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		if (td == REG_64BIT || td == REG_32BIT_ZERO_EX)
+			/* sign extend */
+			emit_instr(ctx, sll, dst, dst, 0);
+		if (insn->imm == 1) {
+			/* div by 1 is a nop, mod by 1 is zero */
+			if (BPF_OP(insn->code) == BPF_MOD)
+				emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO);
+			break;
+		}
+		gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+		emit_instr(ctx, divu, dst, MIPS_R_AT);
+		if (BPF_OP(insn->code) == BPF_DIV)
+			emit_instr(ctx, mflo, dst);
+		else
+			emit_instr(ctx, mfhi, dst);
+		break;
+	case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */
+	case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		if (insn->imm == 0) { /* Div by zero */
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
+		}
+		if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+
+		if (insn->imm == 1) {
+			/* div by 1 is a nop, mod by 1 is zero */
+			if (BPF_OP(insn->code) == BPF_MOD)
+				emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO);
+			break;
+		}
+		gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+		emit_instr(ctx, ddivu, dst, MIPS_R_AT);
+		if (BPF_OP(insn->code) == BPF_DIV)
+			emit_instr(ctx, mflo, dst);
+		else
+			emit_instr(ctx, mfhi, dst);
+		break;
+	case BPF_ALU64 | BPF_MOV | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_ADD | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_SUB | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_XOR | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_OR | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_AND | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_MUL | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_DIV | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_MOD | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_LSH | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_RSH | BPF_X: /* ALU64_REG */
+	case BPF_ALU64 | BPF_ARSH | BPF_X: /* ALU64_REG */
+		src = ebpf_to_mips_reg(ctx, insn, src_reg);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (src < 0 || dst < 0)
+			return -EINVAL;
+		if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
+			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+		did_move = false;
+		if (insn->src_reg == BPF_REG_10) {
+			if (BPF_OP(insn->code) == BPF_MOV) {
+				emit_instr(ctx, daddiu, dst, MIPS_R_SP, MAX_BPF_STACK);
+				did_move = true;
+			} else {
+				emit_instr(ctx, daddiu, MIPS_R_AT, MIPS_R_SP, MAX_BPF_STACK);
+				src = MIPS_R_AT;
+			}
+		} else if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+			int tmp_reg = MIPS_R_AT;
+
+			if (BPF_OP(insn->code) == BPF_MOV) {
+				tmp_reg = dst;
+				did_move = true;
+			}
+			emit_instr(ctx, daddu, tmp_reg, src, MIPS_R_ZERO);
+			emit_instr(ctx, dinsu, tmp_reg, MIPS_R_ZERO, 32, 32);
+			src = MIPS_R_AT;
+		}
+		switch (BPF_OP(insn->code)) {
+		case BPF_MOV:
+			if (!did_move)
+				emit_instr(ctx, daddu, dst, src, MIPS_R_ZERO);
+			break;
+		case BPF_ADD:
+			emit_instr(ctx, daddu, dst, dst, src);
+			break;
+		case BPF_SUB:
+			emit_instr(ctx, dsubu, dst, dst, src);
+			break;
+		case BPF_XOR:
+			emit_instr(ctx, xor, dst, dst, src);
+			break;
+		case BPF_OR:
+			emit_instr(ctx, or, dst, dst, src);
+			break;
+		case BPF_AND:
+			emit_instr(ctx, and, dst, dst, src);
+			break;
+		case BPF_MUL:
+			emit_instr(ctx, dmultu, dst, src);
+			emit_instr(ctx, mflo, dst);
+			break;
+		case BPF_DIV:
+		case BPF_MOD:
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
+			emit_instr(ctx, ddivu, dst, src);
+			if (BPF_OP(insn->code) == BPF_DIV)
+				emit_instr(ctx, mflo, dst);
+			else
+				emit_instr(ctx, mfhi, dst);
+			break;
+		case BPF_LSH:
+			emit_instr(ctx, dsllv, dst, dst, src);
+			break;
+		case BPF_RSH:
+			emit_instr(ctx, dsrlv, dst, dst, src);
+			break;
+		case BPF_ARSH:
+			emit_instr(ctx, dsrav, dst, dst, src);
+			break;
+		default:
+			pr_err("ALU64_REG NOT HANDLED\n");
+			return -EINVAL;
+		}
+		break;
+	case BPF_ALU | BPF_MOV | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_ADD | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_SUB | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_XOR | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_OR | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_AND | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_MUL | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_DIV | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_MOD | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_LSH | BPF_X: /* ALU_REG */
+	case BPF_ALU | BPF_RSH | BPF_X: /* ALU_REG */
+		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (src < 0 || dst < 0)
+			return -EINVAL;
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) {
+			/* sign extend */
+			emit_instr(ctx, sll, dst, dst, 0);
+		}
+		did_move = false;
+		ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+		if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) {
+			int tmp_reg = MIPS_R_AT;
+
+			if (BPF_OP(insn->code) == BPF_MOV) {
+				tmp_reg = dst;
+				did_move = true;
+			}
+			/* sign extend */
+			emit_instr(ctx, sll, tmp_reg, src, 0);
+			src = MIPS_R_AT;
+		}
+		switch (BPF_OP(insn->code)) {
+		case BPF_MOV:
+			if (!did_move)
+				emit_instr(ctx, addu, dst, src, MIPS_R_ZERO);
+			break;
+		case BPF_ADD:
+			emit_instr(ctx, addu, dst, dst, src);
+			break;
+		case BPF_SUB:
+			emit_instr(ctx, subu, dst, dst, src);
+			break;
+		case BPF_XOR:
+			emit_instr(ctx, xor, dst, dst, src);
+			break;
+		case BPF_OR:
+			emit_instr(ctx, or, dst, dst, src);
+			break;
+		case BPF_AND:
+			emit_instr(ctx, and, dst, dst, src);
+			break;
+		case BPF_MUL:
+			emit_instr(ctx, mul, dst, dst, src);
+			break;
+		case BPF_DIV:
+		case BPF_MOD:
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
+			emit_instr(ctx, divu, dst, src);
+			if (BPF_OP(insn->code) == BPF_DIV)
+				emit_instr(ctx, mflo, dst);
+			else
+				emit_instr(ctx, mfhi, dst);
+			break;
+		case BPF_LSH:
+			emit_instr(ctx, sllv, dst, dst, src);
+			break;
+		case BPF_RSH:
+			emit_instr(ctx, srlv, dst, dst, src);
+			break;
+		default:
+			pr_err("ALU_REG NOT HANDLED\n");
+			return -EINVAL;
+		}
+		break;
+	case BPF_JMP | BPF_EXIT:
+		if (this_idx + 1 < exit_idx) {
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, nop);
+		}
+		break;
+	case BPF_JMP | BPF_JEQ | BPF_K: /* JMP_IMM */
+	case BPF_JMP | BPF_JNE | BPF_K: /* JMP_IMM */
+		cmp_eq = (BPF_OP(insn->code) == BPF_JEQ);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+		if (dst < 0)
+			return dst;
+		if (insn->imm == 0) {
+			src = MIPS_R_ZERO;
+		} else {
+			gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+			src = MIPS_R_AT;
+		}
+		goto jeq_common;
+	case BPF_JMP | BPF_JEQ | BPF_X: /* JMP_REG */
+	case BPF_JMP | BPF_JNE | BPF_X:
+	case BPF_JMP | BPF_JSGT | BPF_X:
+	case BPF_JMP | BPF_JSGE | BPF_X:
+	case BPF_JMP | BPF_JGT | BPF_X:
+	case BPF_JMP | BPF_JGE | BPF_X:
+	case BPF_JMP | BPF_JSET | BPF_X:
+		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (src < 0 || dst < 0)
+			return -EINVAL;
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+		if (td == REG_32BIT && ts != REG_32BIT) {
+			emit_instr(ctx, sll, MIPS_R_AT, src, 0);
+			src = MIPS_R_AT;
+		} else if (ts == REG_32BIT && td != REG_32BIT) {
+			emit_instr(ctx, sll, MIPS_R_AT, dst, 0);
+			dst = MIPS_R_AT;
+		}
+		if (BPF_OP(insn->code) == BPF_JSET) {
+			emit_instr(ctx, and, MIPS_R_AT, dst, src);
+			cmp_eq = false;
+			dst = MIPS_R_AT;
+			src = MIPS_R_ZERO;
+		} else if (BPF_OP(insn->code) == BPF_JSGT) {
+			emit_instr(ctx, dsubu, MIPS_R_AT, dst, src);
+			if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+				b_off = b_imm(exit_idx, ctx);
+				if (is_bad_offset(b_off))
+					return -E2BIG;
+				emit_instr(ctx, blez, MIPS_R_AT, b_off);
+				emit_instr(ctx, nop);
+				return 2; /* We consumed the exit. */
+			}
+			b_off = b_imm(this_idx + insn->off + 1, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, bgtz, MIPS_R_AT, b_off);
+			emit_instr(ctx, nop);
+			break;
+		} else if (BPF_OP(insn->code) == BPF_JSGE) {
+			emit_instr(ctx, slt, MIPS_R_AT, dst, src);
+			cmp_eq = true;
+			dst = MIPS_R_AT;
+			src = MIPS_R_ZERO;
+		} else if (BPF_OP(insn->code) == BPF_JGT) {
+			/* dst or src could be AT */
+			emit_instr(ctx, dsubu, MIPS_R_T8, dst, src);
+			emit_instr(ctx, sltu, MIPS_R_AT, dst, src);
+			/* SP known to be non-zero, movz becomes boolean not */
+			emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8);
+			emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8);
+			emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT);
+			cmp_eq = true;
+			dst = MIPS_R_AT;
+			src = MIPS_R_ZERO;
+		} else if (BPF_OP(insn->code) == BPF_JGE) {
+			emit_instr(ctx, sltu, MIPS_R_AT, dst, src);
+			cmp_eq = true;
+			dst = MIPS_R_AT;
+			src = MIPS_R_ZERO;
+		} else { /* JNE/JEQ case */
+			cmp_eq = (BPF_OP(insn->code) == BPF_JEQ);
+		}
+jeq_common:
+		/*
+		 * If the next insn is EXIT and we are jumping arround
+		 * only it, invert the sense of the compare and
+		 * conditionally jump to the exit.  Poor man's branch
+		 * chaining.
+		 */
+		if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+			b_off = b_imm(exit_idx, ctx);
+			if (is_bad_offset(b_off)) {
+				target = j_target(ctx, exit_idx);
+				if (target == (unsigned int)-1)
+					return -E2BIG;
+				cmp_eq = !cmp_eq;
+				b_off = 4 * 3;
+				if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+					ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+					ctx->long_b_conversion = 1;
+				}
+			}
+
+			if (cmp_eq)
+				emit_instr(ctx, bne, dst, src, b_off);
+			else
+				emit_instr(ctx, beq, dst, src, b_off);
+			emit_instr(ctx, nop);
+			if (ctx->offsets[this_idx] & OFFSETS_B_CONV) {
+				emit_instr(ctx, j, target);
+				emit_instr(ctx, nop);
+			}
+			return 2; /* We consumed the exit. */
+		}
+		b_off = b_imm(this_idx + insn->off + 1, ctx);
+		if (is_bad_offset(b_off)) {
+			target = j_target(ctx, this_idx + insn->off + 1);
+			if (target == (unsigned int)-1)
+				return -E2BIG;
+			cmp_eq = !cmp_eq;
+			b_off = 4 * 3;
+			if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+				ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+				ctx->long_b_conversion = 1;
+			}
+		}
+
+		if (cmp_eq)
+			emit_instr(ctx, beq, dst, src, b_off);
+		else
+			emit_instr(ctx, bne, dst, src, b_off);
+		emit_instr(ctx, nop);
+		if (ctx->offsets[this_idx] & OFFSETS_B_CONV) {
+			emit_instr(ctx, j, target);
+			emit_instr(ctx, nop);
+		}
+		break;
+	case BPF_JMP | BPF_JSGT | BPF_K: /* JMP_IMM */
+	case BPF_JMP | BPF_JSGE | BPF_K: /* JMP_IMM */
+		cmp_eq = (BPF_OP(insn->code) == BPF_JSGE);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+		if (dst < 0)
+			return dst;
+
+		if (insn->imm == 0) {
+			if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+				b_off = b_imm(exit_idx, ctx);
+				if (is_bad_offset(b_off))
+					return -E2BIG;
+				if (cmp_eq)
+					emit_instr(ctx, bltz, dst, b_off);
+				else
+					emit_instr(ctx, blez, dst, b_off);
+				emit_instr(ctx, nop);
+				return 2; /* We consumed the exit. */
+			}
+			b_off = b_imm(this_idx + insn->off + 1, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			if (cmp_eq)
+				emit_instr(ctx, bgez, dst, b_off);
+			else
+				emit_instr(ctx, bgtz, dst, b_off);
+			emit_instr(ctx, nop);
+			break;
+		}
+		/*
+		 * only "LT" compare available, so we must use imm + 1
+		 * to generate "GT"
+		 */
+		t64s = insn->imm + (cmp_eq ? 0 : 1);
+		if (t64s >= S16_MIN && t64s <= S16_MAX) {
+			emit_instr(ctx, slti, MIPS_R_AT, dst, (int)t64s);
+			src = MIPS_R_AT;
+			dst = MIPS_R_ZERO;
+			cmp_eq = true;
+			goto jeq_common;
+		}
+		emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s);
+		emit_instr(ctx, slt, MIPS_R_AT, dst, MIPS_R_AT);
+		src = MIPS_R_AT;
+		dst = MIPS_R_ZERO;
+		cmp_eq = true;
+		goto jeq_common;
+
+	case BPF_JMP | BPF_JGT | BPF_K:
+	case BPF_JMP | BPF_JGE | BPF_K:
+		cmp_eq = (BPF_OP(insn->code) == BPF_JGE);
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+		if (dst < 0)
+			return dst;
+		/*
+		 * only "LT" compare available, so we must use imm + 1
+		 * to generate "GT"
+		 */
+		t64s = (u64)(u32)(insn->imm) + (cmp_eq ? 0 : 1);
+		if (t64s >= 0 && t64s <= S16_MAX) {
+			emit_instr(ctx, sltiu, MIPS_R_AT, dst, (int)t64s);
+			src = MIPS_R_AT;
+			dst = MIPS_R_ZERO;
+			cmp_eq = true;
+			goto jeq_common;
+		}
+		emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s);
+		emit_instr(ctx, sltu, MIPS_R_AT, dst, MIPS_R_AT);
+		src = MIPS_R_AT;
+		dst = MIPS_R_ZERO;
+		cmp_eq = true;
+		goto jeq_common;
+
+	case BPF_JMP | BPF_JSET | BPF_K: /* JMP_IMM */
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok);
+		if (dst < 0)
+			return dst;
+
+		if (use_bbit_insns() && hweight32((u32)insn->imm) == 1) {
+			if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) {
+				b_off = b_imm(exit_idx, ctx);
+				if (is_bad_offset(b_off))
+					return -E2BIG;
+				emit_instr(ctx, bbit0, dst, ffs((u32)insn->imm) - 1, b_off);
+				emit_instr(ctx, nop);
+				return 2; /* We consumed the exit. */
+			}
+			b_off = b_imm(this_idx + insn->off + 1, ctx);
+			if (is_bad_offset(b_off))
+				return -E2BIG;
+			emit_instr(ctx, bbit1, dst, ffs((u32)insn->imm) - 1, b_off);
+			emit_instr(ctx, nop);
+			break;
+		}
+		t64 = (u32)insn->imm;
+		emit_const_to_reg(ctx, MIPS_R_AT, t64);
+		emit_instr(ctx, and, MIPS_R_AT, dst, MIPS_R_AT);
+		src = MIPS_R_AT;
+		dst = MIPS_R_ZERO;
+		cmp_eq = false;
+		goto jeq_common;
+
+	case BPF_JMP | BPF_JA:
+		/*
+		 * Prefer relative branch for easier debugging, but
+		 * fall back if needed.
+		 */
+		b_off = b_imm(this_idx + insn->off + 1, ctx);
+		if (is_bad_offset(b_off)) {
+			target = j_target(ctx, this_idx + insn->off + 1);
+			if (target == (unsigned int)-1)
+				return -E2BIG;
+			emit_instr(ctx, j, target);
+		} else {
+			emit_instr(ctx, b, b_off);
+		}
+		emit_instr(ctx, nop);
+		break;
+	case BPF_LD | BPF_DW | BPF_IMM:
+		if (insn->src_reg != 0)
+			return -EINVAL;
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		t64 = ((u64)(u32)insn->imm) | ((u64)(insn + 1)->imm << 32);
+		emit_const_to_reg(ctx, dst, t64);
+		return 2; /* Double slot insn */
+
+	case BPF_JMP | BPF_CALL:
+		ctx->flags |= EBPF_SAVE_RA;
+		t64s = (s64)insn->imm + (s64)__bpf_call_base;
+		emit_const_to_reg(ctx, MIPS_R_T9, (u64)t64s);
+		emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
+		/* delay slot */
+		emit_instr(ctx, nop);
+		break;
+
+	case BPF_JMP | BPF_TAIL_CALL:
+		if (emit_bpf_tail_call(ctx, this_idx))
+			return -EINVAL;
+		break;
+
+	case BPF_LD | BPF_B | BPF_ABS:
+	case BPF_LD | BPF_H | BPF_ABS:
+	case BPF_LD | BPF_W | BPF_ABS:
+	case BPF_LD | BPF_DW | BPF_ABS:
+		ctx->flags |= EBPF_SAVE_RA;
+
+		gen_imm_to_reg(insn, MIPS_R_A1, ctx);
+		emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
+
+		if (insn->imm < 0) {
+			emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper);
+		} else {
+			emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
+			emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
+		}
+		goto ld_skb_common;
+
+	case BPF_LD | BPF_B | BPF_IND:
+	case BPF_LD | BPF_H | BPF_IND:
+	case BPF_LD | BPF_W | BPF_IND:
+	case BPF_LD | BPF_DW | BPF_IND:
+		ctx->flags |= EBPF_SAVE_RA;
+		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+		if (src < 0)
+			return src;
+		ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
+		if (ts == REG_32BIT_ZERO_EX) {
+			/* sign extend */
+			emit_instr(ctx, sll, MIPS_R_A1, src, 0);
+			src = MIPS_R_A1;
+		}
+		if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
+			emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm);
+		} else {
+			gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+			emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src);
+		}
+		/* truncate to 32-bit int */
+		emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0);
+		emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
+		emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO);
+
+		emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper);
+		emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
+		emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
+		emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT);
+
+ld_skb_common:
+		emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
+		/* delay slot move */
+		emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO);
+
+		/* Check the error value */
+		b_off = b_imm(exit_idx, ctx);
+		if (is_bad_offset(b_off)) {
+			target = j_target(ctx, exit_idx);
+			if (target == (unsigned int)-1)
+				return -E2BIG;
+
+			if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
+				ctx->offsets[this_idx] |= OFFSETS_B_CONV;
+				ctx->long_b_conversion = 1;
+			}
+			emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3);
+			emit_instr(ctx, nop);
+			emit_instr(ctx, j, target);
+			emit_instr(ctx, nop);
+		} else {
+			emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off);
+			emit_instr(ctx, nop);
+		}
+
+#ifdef __BIG_ENDIAN
+		need_swap = false;
+#else
+		need_swap = true;
+#endif
+		dst = MIPS_R_V0;
+		switch (BPF_SIZE(insn->code)) {
+		case BPF_B:
+			emit_instr(ctx, lbu, dst, 0, MIPS_R_V0);
+			break;
+		case BPF_H:
+			emit_instr(ctx, lhu, dst, 0, MIPS_R_V0);
+			if (need_swap)
+				emit_instr(ctx, wsbh, dst, dst);
+			break;
+		case BPF_W:
+			emit_instr(ctx, lw, dst, 0, MIPS_R_V0);
+			if (need_swap) {
+				emit_instr(ctx, wsbh, dst, dst);
+				emit_instr(ctx, rotr, dst, dst, 16);
+			}
+			break;
+		case BPF_DW:
+			emit_instr(ctx, ld, dst, 0, MIPS_R_V0);
+			if (need_swap) {
+				emit_instr(ctx, dsbh, dst, dst);
+				emit_instr(ctx, dshd, dst, dst);
+			}
+			break;
+		}
+
+		break;
+	case BPF_ALU | BPF_END | BPF_FROM_BE:
+	case BPF_ALU | BPF_END | BPF_FROM_LE:
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
+		if (insn->imm == 64 && td == REG_32BIT)
+			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
+
+		if (insn->imm != 64 &&
+		    (td == REG_64BIT || td == REG_32BIT_ZERO_EX)) {
+			/* sign extend */
+			emit_instr(ctx, sll, dst, dst, 0);
+		}
+
+#ifdef __BIG_ENDIAN
+		need_swap = (BPF_SRC(insn->code) == BPF_FROM_LE);
+#else
+		need_swap = (BPF_SRC(insn->code) == BPF_FROM_BE);
+#endif
+		if (insn->imm == 16) {
+			if (need_swap)
+				emit_instr(ctx, wsbh, dst, dst);
+			emit_instr(ctx, andi, dst, dst, 0xffff);
+		} else if (insn->imm == 32) {
+			if (need_swap) {
+				emit_instr(ctx, wsbh, dst, dst);
+				emit_instr(ctx, rotr, dst, dst, 16);
+			}
+		} else { /* 64-bit*/
+			if (need_swap) {
+				emit_instr(ctx, dsbh, dst, dst);
+				emit_instr(ctx, dshd, dst, dst);
+			}
+		}
+		break;
+
+	case BPF_ST | BPF_B | BPF_MEM:
+	case BPF_ST | BPF_H | BPF_MEM:
+	case BPF_ST | BPF_W | BPF_MEM:
+	case BPF_ST | BPF_DW | BPF_MEM:
+		if (insn->dst_reg == BPF_REG_10) {
+			ctx->flags |= EBPF_SEEN_FP;
+			dst = MIPS_R_SP;
+			mem_off = insn->off + MAX_BPF_STACK;
+		} else {
+			dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+			if (dst < 0)
+				return dst;
+			mem_off = insn->off;
+		}
+		gen_imm_to_reg(insn, MIPS_R_AT, ctx);
+		switch (BPF_SIZE(insn->code)) {
+		case BPF_B:
+			emit_instr(ctx, sb, MIPS_R_AT, mem_off, dst);
+			break;
+		case BPF_H:
+			emit_instr(ctx, sh, MIPS_R_AT, mem_off, dst);
+			break;
+		case BPF_W:
+			emit_instr(ctx, sw, MIPS_R_AT, mem_off, dst);
+			break;
+		case BPF_DW:
+			emit_instr(ctx, sd, MIPS_R_AT, mem_off, dst);
+			break;
+		}
+		break;
+
+	case BPF_LDX | BPF_B | BPF_MEM:
+	case BPF_LDX | BPF_H | BPF_MEM:
+	case BPF_LDX | BPF_W | BPF_MEM:
+	case BPF_LDX | BPF_DW | BPF_MEM:
+		if (insn->src_reg == BPF_REG_10) {
+			ctx->flags |= EBPF_SEEN_FP;
+			src = MIPS_R_SP;
+			mem_off = insn->off + MAX_BPF_STACK;
+		} else {
+			src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+			if (src < 0)
+				return src;
+			mem_off = insn->off;
+		}
+		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+		if (dst < 0)
+			return dst;
+		switch (BPF_SIZE(insn->code)) {
+		case BPF_B:
+			emit_instr(ctx, lbu, dst, mem_off, src);
+			break;
+		case BPF_H:
+			emit_instr(ctx, lhu, dst, mem_off, src);
+			break;
+		case BPF_W:
+			emit_instr(ctx, lw, dst, mem_off, src);
+			break;
+		case BPF_DW:
+			emit_instr(ctx, ld, dst, mem_off, src);
+			break;
+		}
+		break;
+
+	case BPF_STX | BPF_B | BPF_MEM:
+	case BPF_STX | BPF_H | BPF_MEM:
+	case BPF_STX | BPF_W | BPF_MEM:
+	case BPF_STX | BPF_DW | BPF_MEM:
+	case BPF_STX | BPF_W | BPF_XADD:
+	case BPF_STX | BPF_DW | BPF_XADD:
+		if (insn->dst_reg == BPF_REG_10) {
+			ctx->flags |= EBPF_SEEN_FP;
+			dst = MIPS_R_SP;
+			mem_off = insn->off + MAX_BPF_STACK;
+		} else {
+			dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
+			if (dst < 0)
+				return dst;
+			mem_off = insn->off;
+		}
+		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
+		if (src < 0)
+			return dst;
+		if (BPF_MODE(insn->code) == BPF_XADD) {
+			switch (BPF_SIZE(insn->code)) {
+			case BPF_W:
+				if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+					emit_instr(ctx, sll, MIPS_R_AT, src, 0);
+					src = MIPS_R_AT;
+				}
+				emit_instr(ctx, ll, MIPS_R_T8, mem_off, dst);
+				emit_instr(ctx, addu, MIPS_R_T8, MIPS_R_T8, src);
+				emit_instr(ctx, sc, MIPS_R_T8, mem_off, dst);
+				/*
+				 * On failure back up to LL (-4
+				 * instructions of 4 bytes each
+				 */
+				emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4);
+				emit_instr(ctx, nop);
+				break;
+			case BPF_DW:
+				if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+					emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO);
+					emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32);
+					src = MIPS_R_AT;
+				}
+				emit_instr(ctx, lld, MIPS_R_T8, mem_off, dst);
+				emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, src);
+				emit_instr(ctx, scd, MIPS_R_T8, mem_off, dst);
+				emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4);
+				emit_instr(ctx, nop);
+				break;
+			}
+		} else { /* BPF_MEM */
+			switch (BPF_SIZE(insn->code)) {
+			case BPF_B:
+				emit_instr(ctx, sb, src, mem_off, dst);
+				break;
+			case BPF_H:
+				emit_instr(ctx, sh, src, mem_off, dst);
+				break;
+			case BPF_W:
+				emit_instr(ctx, sw, src, mem_off, dst);
+				break;
+			case BPF_DW:
+				if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) {
+					emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO);
+					emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32);
+					src = MIPS_R_AT;
+				}
+				emit_instr(ctx, sd, src, mem_off, dst);
+				break;
+			}
+		}
+		break;
+
+	default:
+		pr_err("NOT HANDLED %d - (%02x)\n",
+		       this_idx, (unsigned int)insn->code);
+		return -EINVAL;
+	}
+	return 1;
+}
+
+#define RVT_VISITED_MASK 0xc000000000000000ull
+#define RVT_FALL_THROUGH 0x4000000000000000ull
+#define RVT_BRANCH_TAKEN 0x8000000000000000ull
+#define RVT_DONE (RVT_FALL_THROUGH | RVT_BRANCH_TAKEN)
+
+static int build_int_body(struct jit_ctx *ctx)
+{
+	const struct bpf_prog *prog = ctx->skf;
+	const struct bpf_insn *insn;
+	int i, r;
+
+	for (i = 0; i < prog->len; ) {
+		insn = prog->insnsi + i;
+		if ((ctx->reg_val_types[i] & RVT_VISITED_MASK) == 0) {
+			/* dead instruction, don't emit it. */
+			i++;
+			continue;
+		}
+
+		if (ctx->target == NULL)
+			ctx->offsets[i] = (ctx->offsets[i] & OFFSETS_B_CONV) | (ctx->idx * 4);
+
+		r = build_one_insn(insn, ctx, i, prog->len);
+		if (r < 0)
+			return r;
+		i += r;
+	}
+	/* epilogue offset */
+	if (ctx->target == NULL)
+		ctx->offsets[i] = ctx->idx * 4;
+
+	/*
+	 * All exits have an offset of the epilogue, some offsets may
+	 * not have been set due to banch-around threading, so set
+	 * them now.
+	 */
+	if (ctx->target == NULL)
+		for (i = 0; i < prog->len; i++) {
+			insn = prog->insnsi + i;
+			if (insn->code == (BPF_JMP | BPF_EXIT))
+				ctx->offsets[i] = ctx->idx * 4;
+		}
+	return 0;
+}
+
+/* return the last idx processed, or negative for error */
+static int reg_val_propagate_range(struct jit_ctx *ctx, u64 initial_rvt,
+				   int start_idx, bool follow_taken)
+{
+	const struct bpf_prog *prog = ctx->skf;
+	const struct bpf_insn *insn;
+	u64 exit_rvt = initial_rvt;
+	u64 *rvt = ctx->reg_val_types;
+	int idx;
+	int reg;
+
+	for (idx = start_idx; idx < prog->len; idx++) {
+		rvt[idx] = (rvt[idx] & RVT_VISITED_MASK) | exit_rvt;
+		insn = prog->insnsi + idx;
+		switch (BPF_CLASS(insn->code)) {
+		case BPF_ALU:
+			switch (BPF_OP(insn->code)) {
+			case BPF_ADD:
+			case BPF_SUB:
+			case BPF_MUL:
+			case BPF_DIV:
+			case BPF_OR:
+			case BPF_AND:
+			case BPF_LSH:
+			case BPF_RSH:
+			case BPF_NEG:
+			case BPF_MOD:
+			case BPF_XOR:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				break;
+			case BPF_MOV:
+				if (BPF_SRC(insn->code)) {
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				} else {
+					/* IMM to REG move*/
+					if (insn->imm >= 0)
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+					else
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				}
+				break;
+			case BPF_END:
+				if (insn->imm == 64)
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+				else if (insn->imm == 32)
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				else /* insn->imm == 16 */
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+				break;
+			}
+			rvt[idx] |= RVT_DONE;
+			break;
+		case BPF_ALU64:
+			switch (BPF_OP(insn->code)) {
+			case BPF_MOV:
+				if (BPF_SRC(insn->code)) {
+					/* REG to REG move*/
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+				} else {
+					/* IMM to REG move*/
+					if (insn->imm >= 0)
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+					else
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT);
+				}
+				break;
+			default:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+			}
+			rvt[idx] |= RVT_DONE;
+			break;
+		case BPF_LD:
+			switch (BPF_SIZE(insn->code)) {
+			case BPF_DW:
+				if (BPF_MODE(insn->code) == BPF_IMM) {
+					s64 val;
+
+					val = (s64)((u32)insn->imm | ((u64)(insn + 1)->imm << 32));
+					if (val > 0 && val <= S32_MAX)
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+					else if (val >= S32_MIN && val <= S32_MAX)
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT);
+					else
+						set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+					rvt[idx] |= RVT_DONE;
+					idx++;
+				} else {
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+				}
+				break;
+			case BPF_B:
+			case BPF_H:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+				break;
+			case BPF_W:
+				if (BPF_MODE(insn->code) == BPF_IMM)
+					set_reg_val_type(&exit_rvt, insn->dst_reg,
+							 insn->imm >= 0 ? REG_32BIT_POS : REG_32BIT);
+				else
+					set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				break;
+			}
+			rvt[idx] |= RVT_DONE;
+			break;
+		case BPF_LDX:
+			switch (BPF_SIZE(insn->code)) {
+			case BPF_DW:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT);
+				break;
+			case BPF_B:
+			case BPF_H:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS);
+				break;
+			case BPF_W:
+				set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT);
+				break;
+			}
+			rvt[idx] |= RVT_DONE;
+			break;
+		case BPF_JMP:
+			switch (BPF_OP(insn->code)) {
+			case BPF_EXIT:
+				rvt[idx] = RVT_DONE | exit_rvt;
+				rvt[prog->len] = exit_rvt;
+				return idx;
+			case BPF_JA:
+				rvt[idx] |= RVT_DONE;
+				idx += insn->off;
+				break;
+			case BPF_JEQ:
+			case BPF_JGT:
+			case BPF_JGE:
+			case BPF_JSET:
+			case BPF_JNE:
+			case BPF_JSGT:
+			case BPF_JSGE:
+				if (follow_taken) {
+					rvt[idx] |= RVT_BRANCH_TAKEN;
+					idx += insn->off;
+					follow_taken = false;
+				} else {
+					rvt[idx] |= RVT_FALL_THROUGH;
+				}
+				break;
+			case BPF_CALL:
+				set_reg_val_type(&exit_rvt, BPF_REG_0, REG_64BIT);
+				/* Upon call return, argument registers are clobbered. */
+				for (reg = BPF_REG_0; reg <= BPF_REG_5; reg++)
+					set_reg_val_type(&exit_rvt, reg, REG_64BIT);
+
+				rvt[idx] |= RVT_DONE;
+				break;
+			default:
+				WARN(1, "Unhandled BPF_JMP case.\n");
+				rvt[idx] |= RVT_DONE;
+				break;
+			}
+			break;
+		default:
+			rvt[idx] |= RVT_DONE;
+			break;
+		}
+	}
+	return idx;
+}
+
+/*
+ * Track the value range (i.e. 32-bit vs. 64-bit) of each register at
+ * each eBPF insn.  This allows unneeded sign and zero extension
+ * operations to be omitted.
+ *
+ * Doesn't handle yet confluence of control paths with conflicting
+ * ranges, but it is good enough for most sane code.
+ */
+static int reg_val_propagate(struct jit_ctx *ctx)
+{
+	const struct bpf_prog *prog = ctx->skf;
+	u64 exit_rvt;
+	int reg;
+	int i;
+
+	/*
+	 * 11 registers * 3 bits/reg leaves top bits free for other
+	 * uses.  Bit-62..63 used to see if we have visited an insn.
+	 */
+	exit_rvt = 0;
+
+	/* Upon entry, argument registers are 64-bit. */
+	for (reg = BPF_REG_1; reg <= BPF_REG_5; reg++)
+		set_reg_val_type(&exit_rvt, reg, REG_64BIT);
+
+	/*
+	 * First follow all conditional branches on the fall-through
+	 * edge of control flow..
+	 */
+	reg_val_propagate_range(ctx, exit_rvt, 0, false);
+restart_search:
+	/*
+	 * Then repeatedly find the first conditional branch where
+	 * both edges of control flow have not been taken, and follow
+	 * the branch taken edge.  We will end up restarting the
+	 * search once per conditional branch insn.
+	 */
+	for (i = 0; i < prog->len; i++) {
+		u64 rvt = ctx->reg_val_types[i];
+
+		if ((rvt & RVT_VISITED_MASK) == RVT_DONE ||
+		    (rvt & RVT_VISITED_MASK) == 0)
+			continue;
+		if ((rvt & RVT_VISITED_MASK) == RVT_FALL_THROUGH) {
+			reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, true);
+		} else { /* RVT_BRANCH_TAKEN */
+			WARN(1, "Unexpected RVT_BRANCH_TAKEN case.\n");
+			reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, false);
+		}
+		goto restart_search;
+	}
+	/*
+	 * Eventually all conditional branches have been followed on
+	 * both branches and we are done.  Any insn that has not been
+	 * visited at this point is dead.
+	 */
+
+	return 0;
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	u32 *p;
+
+	/* We are guaranteed to have aligned memory. */
+	for (p = area; size >= sizeof(u32); size -= sizeof(u32))
+		uasm_i_break(&p, BRK_BUG); /* Increments p */
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_prog *orig_prog = prog;
+	bool tmp_blinded = false;
+	struct bpf_prog *tmp;
+	struct bpf_binary_header *header = NULL;
+	struct jit_ctx ctx;
+	unsigned int image_size;
+	u8 *image_ptr;
+
+	if (!bpf_jit_enable || !cpu_has_mips64r2)
+		return prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/* If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	memset(&ctx, 0, sizeof(ctx));
+
+	ctx.offsets = kcalloc(prog->len + 1, sizeof(*ctx.offsets), GFP_KERNEL);
+	if (ctx.offsets == NULL)
+		goto out_err;
+
+	ctx.reg_val_types = kcalloc(prog->len + 1, sizeof(*ctx.reg_val_types), GFP_KERNEL);
+	if (ctx.reg_val_types == NULL)
+		goto out_err;
+
+	ctx.skf = prog;
+
+	if (reg_val_propagate(&ctx))
+		goto out_err;
+
+	/*
+	 * First pass discovers used resources and instruction offsets
+	 * assuming short branches are used.
+	 */
+	if (build_int_body(&ctx))
+		goto out_err;
+
+	/*
+	 * If no calls are made (EBPF_SAVE_RA), then tail call count
+	 * in $v1, else we must save in n$s4.
+	 */
+	if (ctx.flags & EBPF_SEEN_TC) {
+		if (ctx.flags & EBPF_SAVE_RA)
+			ctx.flags |= EBPF_SAVE_S4;
+		else
+			ctx.flags |= EBPF_TCC_IN_V1;
+	}
+
+	/*
+	 * Second pass generates offsets, if any branches are out of
+	 * range a jump-around long sequence is generated, and we have
+	 * to try again from the beginning to generate the new
+	 * offsets.  This is done until no additional conversions are
+	 * necessary.
+	 */
+	do {
+		ctx.idx = 0;
+		ctx.gen_b_offsets = 1;
+		ctx.long_b_conversion = 0;
+		if (gen_int_prologue(&ctx))
+			goto out_err;
+		if (build_int_body(&ctx))
+			goto out_err;
+		if (build_int_epilogue(&ctx, MIPS_R_RA))
+			goto out_err;
+	} while (ctx.long_b_conversion);
+
+	image_size = 4 * ctx.idx;
+
+	header = bpf_jit_binary_alloc(image_size, &image_ptr,
+				      sizeof(u32), jit_fill_hole);
+	if (header == NULL)
+		goto out_err;
+
+	ctx.target = (u32 *)image_ptr;
+
+	/* Third pass generates the code */
+	ctx.idx = 0;
+	if (gen_int_prologue(&ctx))
+		goto out_err;
+	if (build_int_body(&ctx))
+		goto out_err;
+	if (build_int_epilogue(&ctx, MIPS_R_RA))
+		goto out_err;
+
+	/* Update the icache */
+	flush_icache_range((unsigned long)ctx.target,
+			   (unsigned long)(ctx.target + ctx.idx * sizeof(u32)));
+
+	if (bpf_jit_enable > 1)
+		/* Dump JIT code */
+		bpf_jit_dump(prog->len, image_size, 2, ctx.target);
+
+	bpf_jit_binary_lock_ro(header);
+	prog->bpf_func = (void *)ctx.target;
+	prog->jited = 1;
+	prog->jited_len = image_size;
+out_normal:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	kfree(ctx.offsets);
+	kfree(ctx.reg_val_types);
+
+	return prog;
+
+out_err:
+	prog = orig_prog;
+	if (header)
+		bpf_jit_binary_free(header);
+	goto out_normal;
+}
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index bd67ac74fe2d..9632436d74d7 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -28,16 +28,15 @@ EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
 
 static int __init pcibios_set_cache_line_size(void)
 {
-	struct cpuinfo_mips *c = &current_cpu_data;
 	unsigned int lsize;
 
 	/*
 	 * Set PCI cacheline size to that of the highest level in the
 	 * cache hierarchy.
 	 */
-	lsize = c->dcache.linesz;
-	lsize = c->scache.linesz ? : lsize;
-	lsize = c->tcache.linesz ? : lsize;
+	lsize = cpu_dcache_line_size();
+	lsize = cpu_scache_line_size() ? : lsize;
+	lsize = cpu_tcache_line_size() ? : lsize;
 
 	BUG_ON(!lsize);
 
diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c
index 094a0ee4af46..9be8b08ae46b 100644
--- a/arch/mips/ralink/mt7620.c
+++ b/arch/mips/ralink/mt7620.c
@@ -12,6 +12,7 @@
 
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/bug.h>
 
 #include <asm/mipsregs.h>
 #include <asm/mach-ralink/ralink_regs.h>
diff --git a/arch/mips/vdso/gettimeofday.c b/arch/mips/vdso/gettimeofday.c
index 974276e828b2..e2690d7ca4dd 100644
--- a/arch/mips/vdso/gettimeofday.c
+++ b/arch/mips/vdso/gettimeofday.c
@@ -35,7 +35,8 @@ static __always_inline long gettimeofday_fallback(struct timeval *_tv,
 	"       syscall\n"
 	: "=r" (ret), "=r" (error)
 	: "r" (tv), "r" (tz), "r" (nr)
-	: "memory");
+	: "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13",
+	  "$14", "$15", "$24", "$25", "hi", "lo", "memory");
 
 	return error ? -ret : ret;
 }
@@ -55,7 +56,8 @@ static __always_inline long clock_gettime_fallback(clockid_t _clkid,
 	"       syscall\n"
 	: "=r" (ret), "=r" (error)
 	: "r" (clkid), "r" (ts), "r" (nr)
-	: "memory");
+	: "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13",
+	  "$14", "$15", "$24", "$25", "hi", "lo", "memory");
 
 	return error ? -ret : ret;
 }
diff --git a/arch/mn10300/include/asm/bug.h b/arch/mn10300/include/asm/bug.h
index aa6a38886391..811414fb002d 100644
--- a/arch/mn10300/include/asm/bug.h
+++ b/arch/mn10300/include/asm/bug.h
@@ -21,7 +21,7 @@ do {								\
 	asm volatile(						\
 		"	syscall 15			\n"	\
 		"0:					\n"	\
-		"	.section __bug_table,\"a\"	\n"	\
+		"	.section __bug_table,\"aw\"	\n"	\
 		"	.long 0b,%0,%1			\n"	\
 		"	.previous			\n"	\
 		:						\
diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h
index 9c7b8f7942d8..fe413b41df6c 100644
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -26,11 +26,6 @@
 
 #define arch_spin_is_locked(x)	(*(volatile signed char *)(&(x)->slock) != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	asm volatile(
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
index 778087341977..8fed278a24b8 100644
--- a/arch/openrisc/include/asm/futex.h
+++ b/arch/openrisc/include/asm/futex.h
@@ -30,20 +30,10 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 531da9eb8f43..dda1f558ef35 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -47,6 +47,9 @@ config PARISC
 	  and later HP3000 series).  The PA-RISC Linux project home page is
 	  at <http://www.parisc-linux.org/>.
 
+config CPU_BIG_ENDIAN
+	def_bool y
+
 config MMU
 	def_bool y
 
diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig
index 143d02652792..ccc109761f44 100644
--- a/arch/parisc/configs/712_defconfig
+++ b/arch/parisc/configs/712_defconfig
@@ -1,11 +1,9 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_SLAB=y
@@ -14,7 +12,6 @@ CONFIG_OPROFILE=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
 CONFIG_PA7100LC=y
 CONFIG_PREEMPT_VOLUNTARY=y
 CONFIG_GSC_LASI=y
@@ -32,11 +29,9 @@ CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
-# CONFIG_INET_LRO is not set
 CONFIG_INET_DIAG=m
 # CONFIG_IPV6 is not set
 CONFIG_NETFILTER=y
-CONFIG_IP_NF_QUEUE=m
 CONFIG_LLC2=m
 CONFIG_NET_PKTGEN=m
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -65,21 +60,20 @@ CONFIG_MD_LINEAR=m
 CONFIG_MD_RAID0=m
 CONFIG_MD_RAID1=m
 CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
 CONFIG_BONDING=m
+CONFIG_DUMMY=m
 CONFIG_TUN=m
-CONFIG_NET_ETHERNET=y
-CONFIG_MII=m
 CONFIG_LASI_82596=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_MPPE=m
 CONFIG_PPPOE=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
 # CONFIG_KEYBOARD_HIL_OLD is not set
 CONFIG_MOUSE_SERIAL=m
+CONFIG_LEGACY_PTY_COUNT=64
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_NR_UARTS=17
@@ -88,22 +82,17 @@ CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 # CONFIG_SERIAL_MUX is not set
 CONFIG_PDC_CONSOLE=y
-CONFIG_LEGACY_PTY_COUNT=64
 CONFIG_PRINTER=m
 CONFIG_PPDEV=m
 # CONFIG_HW_RANDOM is not set
 CONFIG_RAW_DRIVER=y
 # CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=m
 CONFIG_FB=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 CONFIG_DUMMY_CONSOLE_COLUMNS=128
 CONFIG_DUMMY_CONSOLE_ROWS=48
 CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_FONTS=y
-CONFIG_FONT_8x8=y
-CONFIG_FONT_8x16=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
@@ -111,13 +100,9 @@ CONFIG_LOGO=y
 CONFIG_SOUND=y
 CONFIG_SND=y
 CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
 CONFIG_SND_HARMONY=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
 CONFIG_JFS_FS=m
 CONFIG_XFS_FS=m
 CONFIG_AUTOFS4_FS=y
@@ -130,14 +115,10 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_UFS_FS=m
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_NFS_V4=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V4=y
-CONFIG_RPCSEC_GSS_SPKM3=m
-CONFIG_SMB_FS=m
-CONFIG_SMB_NLS_DEFAULT=y
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_737=m
@@ -177,21 +158,16 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
+CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
-CONFIG_CRYPTO_AES=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
@@ -200,6 +176,7 @@ CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
-CONFIG_LIBCRC32C=m
+CONFIG_FONTS=y
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
diff --git a/arch/parisc/configs/a500_defconfig b/arch/parisc/configs/a500_defconfig
index 1a4f776b49b8..5acb93dcaabf 100644
--- a/arch/parisc/configs/a500_defconfig
+++ b/arch/parisc/configs/a500_defconfig
@@ -1,13 +1,10 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_EXPERT=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_SLAB=y
@@ -16,7 +13,6 @@ CONFIG_OPROFILE=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
 CONFIG_PA8X00=y
 CONFIG_64BIT=y
 CONFIG_SMP=y
@@ -43,21 +39,17 @@ CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
-# CONFIG_INET_LRO is not set
 CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_TUNNEL=m
 CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_XT_MATCH_DCCP is not set
-CONFIG_IP_NF_QUEUE=m
 CONFIG_IP_NF_IPTABLES=m
 CONFIG_IP_NF_MATCH_ECN=m
 CONFIG_IP_NF_MATCH_TTL=m
 CONFIG_IP_NF_FILTER=m
 CONFIG_IP_NF_TARGET_REJECT=m
-CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
 CONFIG_IP_NF_MANGLE=m
 CONFIG_IP_NF_TARGET_ECN=m
 CONFIG_IP_NF_RAW=m
@@ -70,7 +62,6 @@ CONFIG_IP6_NF_MATCH_OPTS=m
 CONFIG_IP6_NF_MATCH_HL=m
 CONFIG_IP6_NF_MATCH_IPV6HEADER=m
 CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_LOG=m
 CONFIG_IP6_NF_FILTER=m
 CONFIG_IP6_NF_TARGET_REJECT=m
 CONFIG_IP6_NF_MANGLE=m
@@ -94,7 +85,6 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=y
 CONFIG_BLK_DEV_SR=y
 CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_MULTI_LUN=y
 CONFIG_SCSI_ISCSI_ATTRS=m
 CONFIG_SCSI_SYM53C8XX_2=y
 CONFIG_SCSI_QLOGIC_1280=m
@@ -106,43 +96,38 @@ CONFIG_MD_RAID0=y
 CONFIG_MD_RAID1=y
 CONFIG_FUSION=y
 CONFIG_FUSION_SPI=m
-CONFIG_FUSION_FC=m
 CONFIG_FUSION_CTL=m
 CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
 CONFIG_BONDING=m
+CONFIG_DUMMY=m
 CONFIG_TUN=m
-CONFIG_NET_ETHERNET=y
-CONFIG_NET_VENDOR_3COM=y
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_3C589=m
 CONFIG_VORTEX=m
 CONFIG_TYPHOON=m
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+CONFIG_PCNET32=m
+CONFIG_TIGON3=m
 CONFIG_NET_TULIP=y
 CONFIG_DE2104X=m
 CONFIG_TULIP=y
 CONFIG_TULIP_MMIO=y
 CONFIG_PCMCIA_XIRCOM=m
 CONFIG_HP100=m
-CONFIG_NET_PCI=y
-CONFIG_PCNET32=m
 CONFIG_E100=m
-CONFIG_ACENIC=m
-CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_E1000=m
-CONFIG_TIGON3=m
-CONFIG_NET_PCMCIA=y
-CONFIG_PCMCIA_3C589=m
-CONFIG_PCMCIA_3C574=m
 CONFIG_PCMCIA_SMC91C92=m
 CONFIG_PCMCIA_XIRC2PS=m
 CONFIG_PPP=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_ASYNC=m
 CONFIG_PPP_SYNC_TTY=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_BSDCOMP=m
-# CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_CS=m
@@ -151,7 +136,6 @@ CONFIG_SERIAL_8250_EXTENDED=y
 CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_PDC_CONSOLE=y
-# CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
 CONFIG_RAW_DRIVER=y
 # CONFIG_HWMON is not set
@@ -160,7 +144,6 @@ CONFIG_AGP_PARISC=y
 # CONFIG_STI_CONSOLE is not set
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
 CONFIG_JFS_FS=m
 CONFIG_XFS_FS=m
 CONFIG_AUTOFS4_FS=y
@@ -173,13 +156,9 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_UFS_FS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFSD=m
 CONFIG_NFSD_V4=y
-CONFIG_RPCSEC_GSS_SPKM3=m
-CONFIG_SMB_FS=m
-CONFIG_SMB_NLS_DEFAULT=y
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
@@ -187,17 +166,12 @@ CONFIG_NLS_ASCII=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
-CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_BLOWFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
-CONFIG_LIBCRC32C=m
diff --git a/arch/parisc/configs/b180_defconfig b/arch/parisc/configs/b180_defconfig
index f1a0c25bef8d..83ffd161aec5 100644
--- a/arch/parisc/configs/b180_defconfig
+++ b/arch/parisc/configs/b180_defconfig
@@ -3,7 +3,6 @@ CONFIG_SYSVIPC=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_SLAB=y
 CONFIG_MODULES=y
@@ -25,8 +24,6 @@ CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
@@ -53,10 +50,9 @@ CONFIG_MD_LINEAR=y
 CONFIG_MD_RAID0=y
 CONFIG_MD_RAID1=y
 CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_LASI_82596=y
 CONFIG_NET_TULIP=y
 CONFIG_TULIP=y
+CONFIG_LASI_82596=y
 CONFIG_PPP=y
 CONFIG_INPUT_EVDEV=y
 # CONFIG_KEYBOARD_HIL_OLD is not set
@@ -71,40 +67,31 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_PRINTER=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=m
 CONFIG_FB=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 CONFIG_SOUND=y
 CONFIG_SND=y
 CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
 CONFIG_SND_HARMONY=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
 CONFIG_AUTOFS4_FS=y
 CONFIG_ISO9660_FS=y
 CONFIG_JOLIET=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=y
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=y
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_850=m
 CONFIG_NLS_ASCII=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_HEADERS_CHECK=y
+CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
 CONFIG_SECURITY=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig
index 8e8f0e34f817..0764d3971cf6 100644
--- a/arch/parisc/configs/c3000_defconfig
+++ b/arch/parisc/configs/c3000_defconfig
@@ -1,12 +1,9 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_EXPERT=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_SLAB=y
@@ -15,7 +12,6 @@ CONFIG_OPROFILE=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
 CONFIG_PA8X00=y
 CONFIG_PREEMPT_VOLUNTARY=y
 # CONFIG_GSC is not set
@@ -31,13 +27,11 @@ CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_LRO is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_TUNNEL=m
 CONFIG_NETFILTER=y
 CONFIG_NETFILTER_DEBUG=y
-CONFIG_IP_NF_QUEUE=m
 CONFIG_NET_PKTGEN=m
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -50,13 +44,11 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_IDE=y
 CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_NS87415=y
-CONFIG_PATA_SIL680=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=y
 CONFIG_BLK_DEV_SR=y
 CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_MULTI_LUN=y
 CONFIG_SCSI_ISCSI_ATTRS=m
 CONFIG_SCSI_SYM53C8XX_2=y
 CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
@@ -76,28 +68,23 @@ CONFIG_FUSION=y
 CONFIG_FUSION_SPI=m
 CONFIG_FUSION_CTL=m
 CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
 CONFIG_BONDING=m
+CONFIG_DUMMY=m
 CONFIG_TUN=m
-CONFIG_NET_ETHERNET=y
+CONFIG_ACENIC=m
+CONFIG_TIGON3=m
 CONFIG_NET_TULIP=y
 CONFIG_DE2104X=m
 CONFIG_TULIP=y
 CONFIG_TULIP_MMIO=y
-CONFIG_NET_PCI=y
 CONFIG_E100=m
-CONFIG_ACENIC=m
 CONFIG_E1000=m
-CONFIG_TIGON3=m
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
 CONFIG_PPPOE=m
-# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1600
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=1200
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
 # CONFIG_KEYBOARD_ATKBD is not set
 # CONFIG_MOUSE_PS2 is not set
 CONFIG_SERIO=m
@@ -111,7 +98,6 @@ CONFIG_SERIAL_8250_SHARE_IRQ=y
 # CONFIG_HW_RANDOM is not set
 CONFIG_RAW_DRIVER=y
 # CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=m
 CONFIG_FB=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
@@ -121,9 +107,6 @@ CONFIG_LOGO=y
 CONFIG_SOUND=y
 CONFIG_SND=y
 CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
 CONFIG_SND_AD1889=y
 CONFIG_USB_HIDDEV=y
 CONFIG_USB=y
@@ -139,7 +122,6 @@ CONFIG_USB_MICROTEK=m
 CONFIG_USB_LEGOTOWER=m
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
 CONFIG_XFS_FS=m
 CONFIG_AUTOFS4_FS=y
 CONFIG_ISO9660_FS=y
@@ -149,7 +131,6 @@ CONFIG_VFAT_FS=m
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=y
 CONFIG_NFSD_V3=y
@@ -159,18 +140,13 @@ CONFIG_NLS_ASCII=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
-CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_MUTEXES=y
 # CONFIG_DEBUG_BUGVERBOSE is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_MD5=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_DES=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
-CONFIG_LIBCRC32C=m
diff --git a/arch/parisc/configs/c8000_defconfig b/arch/parisc/configs/c8000_defconfig
index f6a4c016304b..088ab948a5ca 100644
--- a/arch/parisc/configs/c8000_defconfig
+++ b/arch/parisc/configs/c8000_defconfig
@@ -1,16 +1,13 @@
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
+# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_RD_LZO=y
 CONFIG_EXPERT=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_SLAB=y
@@ -23,7 +20,6 @@ CONFIG_PA8X00=y
 CONFIG_64BIT=y
 CONFIG_SMP=y
 CONFIG_PREEMPT=y
-# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_IOMMU_CCIO=y
 CONFIG_PCI=y
 CONFIG_PCI_LBA=y
@@ -146,7 +142,6 @@ CONFIG_FB_FOREIGN_ENDIAN=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 # CONFIG_FB_STI is not set
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 # CONFIG_BACKLIGHT_GENERIC is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
@@ -157,12 +152,9 @@ CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_CLUT224 is not set
 CONFIG_SOUND=m
 CONFIG_SND=m
+CONFIG_SND_VERBOSE_PRINTK=y
 CONFIG_SND_SEQUENCER=m
 CONFIG_SND_SEQ_DUMMY=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_VERBOSE_PRINTK=y
 CONFIG_SND_AD1889=m
 # CONFIG_SND_USB is not set
 # CONFIG_SND_GSC is not set
@@ -174,8 +166,6 @@ CONFIG_EXT2_FS_XATTR=y
 CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_EXT4_FS=m
 CONFIG_REISERFS_FS=m
 CONFIG_REISERFS_PROC_INFO=y
 CONFIG_XFS_FS=m
@@ -238,11 +228,8 @@ CONFIG_DEBUG_SLAB=y
 CONFIG_DEBUG_SLAB_LEAK=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_DEBUG_RT_MUTEXES=y
-CONFIG_PROVE_RCU_DELAY=y
 CONFIG_DEBUG_BLOCK_EXT_DEVT=y
 CONFIG_LATENCYTOP=y
 CONFIG_KEYS=y
diff --git a/arch/parisc/configs/default_defconfig b/arch/parisc/configs/default_defconfig
index 310b6657e4ac..52c9050a7c5c 100644
--- a/arch/parisc/configs/default_defconfig
+++ b/arch/parisc/configs/default_defconfig
@@ -1,11 +1,9 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
-CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_SLAB=y
@@ -41,9 +39,7 @@ CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
-# CONFIG_INET_LRO is not set
 CONFIG_INET_DIAG=m
-CONFIG_IPV6=y
 CONFIG_INET6_AH=y
 CONFIG_INET6_ESP=y
 CONFIG_INET6_IPCOMP=y
@@ -82,26 +78,23 @@ CONFIG_MD_RAID1=y
 CONFIG_MD_RAID10=y
 CONFIG_BLK_DEV_DM=y
 CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
 CONFIG_BONDING=m
+CONFIG_DUMMY=m
 CONFIG_TUN=m
-CONFIG_NET_ETHERNET=y
-CONFIG_MII=m
-CONFIG_LASI_82596=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=y
-CONFIG_NET_PCI=y
 CONFIG_ACENIC=y
 CONFIG_TIGON3=y
-CONFIG_NET_PCMCIA=y
+CONFIG_NET_TULIP=y
+CONFIG_TULIP=y
+CONFIG_LASI_82596=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
 CONFIG_PPPOE=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
 # CONFIG_KEYBOARD_HIL_OLD is not set
 CONFIG_MOUSE_SERIAL=y
+CONFIG_LEGACY_PTY_COUNT=64
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_CS=y
@@ -109,31 +102,24 @@ CONFIG_SERIAL_8250_NR_UARTS=17
 CONFIG_SERIAL_8250_EXTENDED=y
 CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
-CONFIG_LEGACY_PTY_COUNT=64
 CONFIG_PRINTER=m
 CONFIG_PPDEV=m
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
-CONFIG_VIDEO_OUTPUT_CONTROL=m
 CONFIG_FB=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 CONFIG_DUMMY_CONSOLE_COLUMNS=128
 CONFIG_DUMMY_CONSOLE_ROWS=48
 CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_FONTS=y
-CONFIG_FONT_8x16=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
 # CONFIG_LOGO_LINUX_CLUT224 is not set
 CONFIG_SOUND=y
 CONFIG_SND=y
-CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
 CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_SEQUENCER=y
 CONFIG_SND_AD1889=y
 CONFIG_SND_HARMONY=y
 CONFIG_HID_GYRATION=y
@@ -141,7 +127,6 @@ CONFIG_HID_NTRIG=y
 CONFIG_HID_PANTHERLORD=y
 CONFIG_HID_PETALYNX=y
 CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SONY=y
 CONFIG_HID_SUNPLUS=y
 CONFIG_HID_TOPSEED=y
 CONFIG_USB=y
@@ -150,21 +135,15 @@ CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_AUTOFS_FS=y
 CONFIG_ISO9660_FS=y
 CONFIG_JOLIET=y
 CONFIG_VFAT_FS=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
 CONFIG_NFSD=y
 CONFIG_NFSD_V4=y
-CONFIG_RPCSEC_GSS_SPKM3=m
-CONFIG_SMB_FS=m
-CONFIG_SMB_NLS_DEFAULT=y
 CONFIG_CIFS=m
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_CODEPAGE_737=m
@@ -204,30 +183,24 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=y
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
 CONFIG_KEYS=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
-CONFIG_CRYPTO_AES=m
 CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_ARC4=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
 CONFIG_LIBCRC32C=m
+CONFIG_FONTS=y
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 8688ba7f5966..37ae4b57c001 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -2,15 +2,11 @@ CONFIG_LOCALVERSION="-32bit"
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_RD_LZO=y
 CONFIG_EXPERT=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_PERF_EVENTS=y
@@ -49,7 +45,6 @@ CONFIG_INET_ESP=m
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 CONFIG_INET_DIAG=m
 CONFIG_LLC2=m
 # CONFIG_WIRELESS is not set
@@ -149,10 +144,8 @@ CONFIG_PRINTER=m
 CONFIG_PPDEV=m
 # CONFIG_HW_RANDOM is not set
 CONFIG_I2C=y
-CONFIG_POWER_SUPPLY=y
 # CONFIG_HWMON is not set
 CONFIG_AGP=y
-CONFIG_VIDEO_OUTPUT_CONTROL=y
 CONFIG_FB=y
 CONFIG_FB_FOREIGN_ENDIAN=y
 CONFIG_FB_MODE_HELPERS=y
@@ -169,11 +162,8 @@ CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_CLUT224 is not set
 CONFIG_SOUND=m
 CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
 CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_SEQUENCER=m
 CONFIG_SND_AD1889=m
 CONFIG_SND_HARMONY=m
 CONFIG_HIDRAW=y
@@ -223,12 +213,7 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT2_FS_XATTR=y
 CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_SECURITY=y
-CONFIG_EXT4_FS=y
-CONFIG_XFS_FS=m
-CONFIG_XFS_QUOTA=y
-CONFIG_XFS_RT=y
 CONFIG_QUOTA=y
 CONFIG_QUOTA_NETLINK_INTERFACE=y
 CONFIG_QFMT_V2=y
@@ -293,15 +278,12 @@ CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
-CONFIG_TIMER_STATS=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_RCU_CPU_STALL_INFO=y
 CONFIG_LATENCYTOP=y
 CONFIG_LKDTM=m
 CONFIG_KEYS=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5=y
@@ -320,7 +302,6 @@ CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRC_CCITT=m
 CONFIG_CRC_T10DIF=y
 CONFIG_FONTS=y
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index c564e6e1fa23..d39e7f821aba 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -8,10 +8,11 @@ CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-# CONFIG_UTS_NS is not set
-# CONFIG_IPC_NS is not set
-# CONFIG_PID_NS is not set
-# CONFIG_NET_NS is not set
+CONFIG_CGROUPS=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_SWAP=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CPUSETS=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -52,7 +53,6 @@ CONFIG_INET_ESP=m
 CONFIG_INET_XFRM_MODE_TRANSPORT=m
 CONFIG_INET_XFRM_MODE_TUNNEL=m
 CONFIG_INET_XFRM_MODE_BEET=m
-CONFIG_INET_LRO=m
 CONFIG_INET_DIAG=m
 CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
@@ -84,7 +84,6 @@ CONFIG_PATA_SIL680=y
 CONFIG_ATA_GENERIC=y
 CONFIG_MD=y
 CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
 CONFIG_BLK_DEV_DM=m
 CONFIG_DM_RAID=m
 CONFIG_DM_UEVENT=y
@@ -138,21 +137,21 @@ CONFIG_QLGE=m
 # CONFIG_NET_VENDOR_TI is not set
 # CONFIG_NET_VENDOR_VIA is not set
 # CONFIG_NET_VENDOR_WIZNET is not set
+CONFIG_MDIO_BITBANG=m
 CONFIG_PHYLIB=y
-CONFIG_MARVELL_PHY=m
-CONFIG_DAVICOM_PHY=m
-CONFIG_QSEMI_PHY=m
-CONFIG_LXT_PHY=m
-CONFIG_CICADA_PHY=m
-CONFIG_VITESSE_PHY=m
-CONFIG_SMSC_PHY=m
 CONFIG_BROADCOM_PHY=m
+CONFIG_CICADA_PHY=m
+CONFIG_DAVICOM_PHY=m
 CONFIG_ICPLUS_PHY=m
-CONFIG_REALTEK_PHY=m
+CONFIG_LSI_ET1011C_PHY=m
+CONFIG_LXT_PHY=m
+CONFIG_MARVELL_PHY=m
 CONFIG_NATIONAL_PHY=m
+CONFIG_QSEMI_PHY=m
+CONFIG_REALTEK_PHY=m
+CONFIG_SMSC_PHY=m
 CONFIG_STE10XP=m
-CONFIG_LSI_ET1011C_PHY=m
-CONFIG_MDIO_BITBANG=m
+CONFIG_VITESSE_PHY=m
 CONFIG_SLIP=m
 CONFIG_SLIP_COMPRESSED=y
 CONFIG_SLIP_SMART=y
@@ -166,10 +165,8 @@ CONFIG_INPUT_MISC=y
 CONFIG_SERIO_SERPORT=m
 # CONFIG_HP_SDC is not set
 CONFIG_SERIO_RAW=m
-CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_NOZOMI=m
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y
@@ -207,10 +204,8 @@ CONFIG_AGP=y
 CONFIG_AGP_PARISC=y
 CONFIG_DRM=y
 CONFIG_DRM_RADEON=y
-CONFIG_DRM_RADEON_UMS=y
 CONFIG_FIRMWARE_EDID=y
 CONFIG_FB_MODE_HELPERS=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_BACKLIGHT_GENERIC is not set
 CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
 CONFIG_LOGO=y
@@ -246,8 +241,6 @@ CONFIG_EXT2_FS_XATTR=y
 CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
 CONFIG_EXT3_FS_SECURITY=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
 CONFIG_XFS_FS=m
 CONFIG_BTRFS_FS=m
 CONFIG_QUOTA=y
@@ -286,27 +279,16 @@ CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
 # CONFIG_SCHED_DEBUG is not set
-CONFIG_TIMER_STATS=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_ARC4=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_DEFLATE=m
 # CONFIG_CRYPTO_HW is not set
 CONFIG_CRC_CCITT=m
 CONFIG_LIBCRC32C=y
-CONFIG_XZ_DEC_X86=y
-CONFIG_XZ_DEC_POWERPC=y
-CONFIG_XZ_DEC_IA64=y
-CONFIG_XZ_DEC_ARM=y
-CONFIG_XZ_DEC_ARMTHUMB=y
-CONFIG_XZ_DEC_SPARC=y
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 5394b9c5f914..17b98a87e5e2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i)
 	_atomic_spin_unlock_irqrestore(v, flags);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 static __inline__ int atomic_read(const atomic_t *v)
 {
 	return READ_ONCE((v)->counter);
diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h
index d2742273a685..07ea467f22fc 100644
--- a/arch/parisc/include/asm/bug.h
+++ b/arch/parisc/include/asm/bug.h
@@ -27,7 +27,7 @@
 	do {								\
 		asm volatile("\n"					\
 			     "1:\t" PARISC_BUG_BREAK_ASM "\n"		\
-			     "\t.pushsection __bug_table,\"a\"\n"	\
+			     "\t.pushsection __bug_table,\"aw\"\n"	\
 			     "2:\t" ASM_WORD_INSN "1b, %c0\n"		\
 			     "\t.short %c1, %c2\n"			\
 			     "\t.org 2b+%c3\n"				\
@@ -50,7 +50,7 @@
 	do {								\
 		asm volatile("\n"					\
 			     "1:\t" PARISC_BUG_BREAK_ASM "\n"		\
-			     "\t.pushsection __bug_table,\"a\"\n"	\
+			     "\t.pushsection __bug_table,\"aw\"\n"	\
 			     "2:\t" ASM_WORD_INSN "1b, %c0\n"		\
 			     "\t.short %c1, %c2\n"			\
 			     "\t.org 2b+%c3\n"				\
@@ -64,7 +64,7 @@
 	do {								\
 		asm volatile("\n"					\
 			     "1:\t" PARISC_BUG_BREAK_ASM "\n"		\
-			     "\t.pushsection __bug_table,\"a\"\n"	\
+			     "\t.pushsection __bug_table,\"aw\"\n"	\
 			     "2:\t" ASM_WORD_INSN "1b\n"		\
 			     "\t.short %c0\n"				\
 			     "\t.org 2b+%c1\n"				\
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0ba14300cd8e..c601aab2fb36 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
 }
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
 	unsigned long int flags;
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval, ret;
 	u32 tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
-		return -EFAULT;
-
 	_futex_spin_lock_irqsave(uaddr, &flags);
 	pagefault_disable();
 
@@ -85,17 +75,9 @@ out_pagefault_enable:
 	pagefault_enable();
 	_futex_spin_unlock_irqrestore(uaddr, &flags);
 
-	if (ret == 0) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h
index 32e105fb8adb..e3c0586260d8 100644
--- a/arch/parisc/include/asm/pdcpat.h
+++ b/arch/parisc/include/asm/pdcpat.h
@@ -150,7 +150,7 @@
 #define PDC_PAT_MEM_SETGM		9L /* Set Good Memory value        */
 #define PDC_PAT_MEM_ADD_PAGE		10L /* ADDs a page to the cell      */
 #define PDC_PAT_MEM_ADDRESS		11L /* Get Physical Location From   */
-                                    		 /* Memory Address               */
+					    /* Memory Address               */
 #define PDC_PAT_MEM_GET_TXT_SIZE   	12L /* Get Formatted Text Size   */
 #define PDC_PAT_MEM_GET_PD_TXT     	13L /* Get PD Formatted Text     */
 #define PDC_PAT_MEM_GET_CELL_TXT   	14L /* Get Cell Formatted Text   */
@@ -228,6 +228,17 @@ struct pdc_pat_mem_read_pd_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_READ */
 	unsigned long pdt_entries;
 };
 
+struct pdc_pat_mem_phys_mem_location { /* PDC_PAT_MEM/PDC_PAT_MEM_ADDRESS */
+	u64 cabinet:8;
+	u64 ign1:8;
+	u64 ign2:8;
+	u64 cell_slot:8;
+	u64 ign3:8;
+	u64 dimm_slot:8; /* DIMM slot, e.g. 0x1A, 0x2B, show user hex value! */
+	u64 ign4:8;
+	u64 source:4; /* for mem: always 0x07 */
+	u64 source_detail:4; /* for mem: always 0x04 (SIMM or DIMM) */
+};
 
 struct pdc_pat_pd_addr_map_entry {
 	unsigned char entry_type;       /* 1 = Memory Descriptor Entry Type */
@@ -319,6 +330,9 @@ extern int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret,
 extern int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret,
 		unsigned long *pdt_entries_ptr, unsigned long count,
 		unsigned long offset);
+extern int pdc_pat_mem_get_dimm_phys_location(
+                struct pdc_pat_mem_phys_mem_location *pret,
+                unsigned long phys_addr);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index e32936cd7f10..55bfe4affca3 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x)
 
 #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *x)
-{
-	volatile unsigned int *a = __ldcw_align(x);
-
-	smp_cond_load_acquire(a, VAL);
-}
-
 static inline void arch_spin_lock_flags(arch_spinlock_t *x,
 					 unsigned long flags)
 {
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index 88fe0aad4390..bc208136bbb2 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -34,7 +34,7 @@ struct thread_info {
 
 /* thread information allocation */
 
-#define THREAD_SIZE_ORDER	2 /* PA-RISC requires at least 16k stack */
+#define THREAD_SIZE_ORDER	3 /* PA-RISC requires at least 32k stack */
 /* Be sure to hunt all references to this down when you change the size of
  * the kernel stack */
 #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER)
diff --git a/arch/parisc/include/uapi/asm/ioctls.h b/arch/parisc/include/uapi/asm/ioctls.h
index 674c68a5bbd0..d0e3321403be 100644
--- a/arch/parisc/include/uapi/asm/ioctls.h
+++ b/arch/parisc/include/uapi/asm/ioctls.h
@@ -60,7 +60,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index c32a09095216..19c0c141bc3f 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -453,8 +453,8 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
 	  before it can be accessed through the kernel mapping. */
 	preempt_disable();
 	flush_dcache_page_asm(__pa(vfrom), vaddr);
-	preempt_enable();
 	copy_page_asm(vto, vfrom);
+	preempt_enable();
 }
 EXPORT_SYMBOL(copy_user_page);
 
@@ -539,6 +539,10 @@ void flush_cache_mm(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	pgd_t *pgd;
 
+	/* Flush the TLB to avoid speculation if coherency is required. */
+	if (parisc_requires_coherency())
+		flush_tlb_all();
+
 	/* Flushing the whole cache on each cpu takes forever on
 	   rp3440, etc.  So, avoid it if the mm isn't too big.  */
 	if (mm_total_size(mm) >= parisc_cache_flush_threshold) {
@@ -577,33 +581,21 @@ void flush_cache_mm(struct mm_struct *mm)
 void flush_cache_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
 {
-	unsigned long addr;
-	pgd_t *pgd;
-
 	BUG_ON(!vma->vm_mm->context);
 
-	if ((end - start) >= parisc_cache_flush_threshold) {
-		flush_cache_all();
-		return;
-	}
+	/* Flush the TLB to avoid speculation if coherency is required. */
+	if (parisc_requires_coherency())
+		flush_tlb_range(vma, start, end);
 
-	if (vma->vm_mm->context == mfsp(3)) {
-		flush_user_dcache_range_asm(start, end);
-		if (vma->vm_flags & VM_EXEC)
-			flush_user_icache_range_asm(start, end);
+	if ((end - start) >= parisc_cache_flush_threshold
+	    || vma->vm_mm->context != mfsp(3)) {
+		flush_cache_all();
 		return;
 	}
 
-	pgd = vma->vm_mm->pgd;
-	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-		unsigned long pfn;
-		pte_t *ptep = get_ptep(pgd, addr);
-		if (!ptep)
-			continue;
-		pfn = pte_pfn(*ptep);
-		if (pfn_valid(pfn))
-			__flush_cache_page(vma, addr, PFN_PHYS(pfn));
-	}
+	flush_user_dcache_range_asm(start, end);
+	if (vma->vm_flags & VM_EXEC)
+		flush_user_icache_range_asm(start, end);
 }
 
 void
@@ -612,7 +604,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
 	BUG_ON(!vma->vm_mm->context);
 
 	if (pfn_valid(pfn)) {
-		flush_tlb_page(vma, vmaddr);
+		if (parisc_requires_coherency())
+			flush_tlb_page(vma, vmaddr);
 		__flush_cache_page(vma, vmaddr, PFN_PHYS(pfn));
 	}
 }
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c
index 98190252c12f..f622a311d04a 100644
--- a/arch/parisc/kernel/firmware.c
+++ b/arch/parisc/kernel/firmware.c
@@ -1481,12 +1481,44 @@ int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret,
 		unsigned long offset)
 {
 	int retval;
-	unsigned long flags;
+	unsigned long flags, entries;
 
 	spin_lock_irqsave(&pdc_lock, flags);
 	retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_PD_READ,
-		__pa(&pret), __pa(pdt_entries_ptr),
+		__pa(&pdc_result), __pa(pdt_entries_ptr),
 		count, offset);
+
+	if (retval == PDC_OK) {
+		entries = min(pdc_result[0], count);
+		pret->actual_count_bytes = entries;
+		pret->pdt_entries = entries / sizeof(unsigned long);
+	}
+
+	spin_unlock_irqrestore(&pdc_lock, flags);
+
+	return retval;
+}
+
+/**
+ * pdc_pat_mem_get_dimm_phys_location - Get physical DIMM slot via PAT firmware
+ * @pret: ptr to hold returned information
+ * @phys_addr: physical address to examine
+ *
+ */
+int pdc_pat_mem_get_dimm_phys_location(
+		struct pdc_pat_mem_phys_mem_location *pret,
+		unsigned long phys_addr)
+{
+	int retval;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdc_lock, flags);
+	retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_ADDRESS,
+		__pa(&pdc_result), phys_addr);
+
+	if (retval == PDC_OK)
+		memcpy(pret, &pdc_result, sizeof(*pret));
+
 	spin_unlock_irqrestore(&pdc_lock, flags);
 
 	return retval;
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index ba5e1c7b1f17..0ca254085a66 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -380,7 +380,7 @@ static inline int eirr_to_irq(unsigned long eirr)
 /*
  * IRQ STACK - used for irq handler
  */
-#define IRQ_STACK_SIZE      (4096 << 2) /* 16k irq stack size */
+#define IRQ_STACK_SIZE      (4096 << 3) /* 32k irq stack size */
 
 union irq_stack_union {
 	unsigned long stack[IRQ_STACK_SIZE/sizeof(unsigned long)];
@@ -413,6 +413,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	if (regs->sr[7])
 		return;
 
+	/* exit if already in panic */
+	if (sysctl_panic_on_stackoverflow < 0)
+		return;
+
 	/* calculate kernel stack usage */
 	stack_usage = sp - stack_start;
 #ifdef CONFIG_IRQSTACKS
@@ -454,8 +458,10 @@ check_kernel_stack:
 #ifdef CONFIG_IRQSTACKS
 panic_check:
 #endif
-	if (sysctl_panic_on_stackoverflow)
+	if (sysctl_panic_on_stackoverflow) {
+		sysctl_panic_on_stackoverflow = -1; /* disable further checks */
 		panic("low stack detected by irq handler - check messages\n");
+	}
 #endif
 }
 
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index f3a797e670b0..d02874ecb94d 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -112,10 +112,12 @@ void __init pdc_pdt_init(void)
 #ifdef CONFIG_64BIT
 		struct pdc_pat_mem_read_pd_retinfo pat_pret;
 
+		/* try old obsolete PAT firmware function first */
+		pdt_type = PDT_PAT_OLD;
 		ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry,
 			MAX_PDT_ENTRIES);
 		if (ret != PDC_OK) {
-			pdt_type = PDT_PAT_OLD;
+			pdt_type = PDT_PAT_NEW;
 			ret = pdc_pat_mem_read_pd_pdt(&pat_pret, pdt_entry,
 				MAX_PDT_TABLE_SIZE, 0);
 		}
@@ -131,11 +133,20 @@ void __init pdc_pdt_init(void)
 	}
 
 	for (i = 0; i < pdt_status.pdt_entries; i++) {
-		if (i < 20)
-			pr_warn("PDT: BAD PAGE #%d at 0x%08lx (error_type = %lu)\n",
-				i,
-				pdt_entry[i] & PAGE_MASK,
-				pdt_entry[i] & 1);
+		struct pdc_pat_mem_phys_mem_location loc;
+
+		/* get DIMM slot number */
+		loc.dimm_slot = 0xff;
+#ifdef CONFIG_64BIT
+		pdc_pat_mem_get_dimm_phys_location(&loc, pdt_entry[i]);
+#endif
+
+		pr_warn("PDT: BAD PAGE #%d at 0x%08lx, "
+			"DIMM slot %02x (error_type = %lu)\n",
+			i,
+			pdt_entry[i] & PAGE_MASK,
+			loc.dimm_slot,
+			pdt_entry[i] & 1);
 
 		/* mark memory page bad */
 		memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index b64d7d21646e..a45a67d526f8 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -53,6 +53,7 @@
 #include <linux/uaccess.h>
 #include <linux/rcupdate.h>
 #include <linux/random.h>
+#include <linux/nmi.h>
 
 #include <asm/io.h>
 #include <asm/asm-offsets.h>
@@ -145,6 +146,7 @@ void machine_power_off(void)
 
 	/* prevent soft lockup/stalled CPU messages for endless loop. */
 	rcu_sysrq_start();
+	lockup_detector_suspend();
 	for (;;);
 }
 
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 3d6ef1b29c6a..ffe2cbf52d1a 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -78,6 +78,8 @@ SECTIONS
 		*(.text.sys_exit)
 		*(.text.do_sigaltstack)
 		*(.text.do_fork)
+		*(.text.div)
+		*($$*)			/* millicode routines */
 		*(.text.*)
 		*(.fixup)
 		*(.lock.text)		/* out-of-line lock text */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..81b0031f909f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -199,7 +199,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8d4ed73d5490..e2b3e7a00c9e 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -59,6 +59,19 @@ machine-$(CONFIG_PPC64) += 64
 machine-$(CONFIG_CPU_LITTLE_ENDIAN) += le
 UTS_MACHINE := $(subst $(space),,$(machine-y))
 
+# XXX This needs to be before we override LD below
+ifdef CONFIG_PPC32
+KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
+else
+ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
+# Have the linker provide sfpr if possible.
+# There is a corresponding test in arch/powerpc/lib/Makefile
+KBUILD_LDFLAGS_MODULE += --save-restore-funcs
+else
+KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
+endif
+endif
+
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override LD	+= -EL
 LDEMULATION	:= lppc
@@ -190,18 +203,6 @@ else
 CHECKFLAGS	+= -D__LITTLE_ENDIAN__
 endif
 
-ifdef CONFIG_PPC32
-KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
-else
-ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
-# Have the linker provide sfpr if possible.
-# There is a corresponding test in arch/powerpc/lib/Makefile
-KBUILD_LDFLAGS_MODULE += --save-restore-funcs
-else
-KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
-endif
-endif
-
 ifeq ($(CONFIG_476FPE_ERR46),y)
 	KBUILD_LDFLAGS_MODULE += --ppc476-workaround \
 		-T $(srctree)/arch/powerpc/platforms/44x/ppc476_modules.lds
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index a7814a7b1523..6f952fe1f084 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -25,12 +25,20 @@ compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 		 -fno-strict-aliasing -Os -msoft-float -pipe \
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
-		 -isystem $(shell $(CROSS32CC) -print-file-name=include) \
 		 -D$(compress-y)
 
+BOOTCC := $(CC)
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS	+= -m64
+else
+BOOTCFLAGS	+= -m32
+ifdef CROSS32_COMPILE
+    BOOTCC := $(CROSS32_COMPILE)gcc
+endif
 endif
+
+BOOTCFLAGS	+= -isystem $(shell $(BOOTCC) -print-file-name=include)
+
 ifdef CONFIG_CPU_BIG_ENDIAN
 BOOTCFLAGS	+= -mbig-endian
 else
@@ -183,10 +191,10 @@ clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
 		empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
 quiet_cmd_bootcc = BOOTCC  $@
-      cmd_bootcc = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
+      cmd_bootcc = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
 
 quiet_cmd_bootas = BOOTAS  $@
-      cmd_bootas = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
+      cmd_bootas = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
 
 quiet_cmd_bootar = BOOTAR  $@
       cmd_bootar = $(CROSS32AR) -cr$(KBUILD_ARFLAGS) $@.$$$$ $(filter-out FORCE,$^); mv $@.$$$$ $@
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 0695ce047d56..34fc9bbfca9e 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -293,7 +293,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 5175028c56ce..c5246d29f385 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -324,7 +324,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 1a61aa20dfba..fd5d98a0b95c 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -291,7 +291,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 25d42bd3f114..9c601adfc500 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -74,13 +74,6 @@ do {									\
 	___p1;								\
 })
 
-/*
- * This must resolve to hwsync on SMP for the context switch path.
- * See _switch, and core scheduler context switch memory ordering
- * comments.
- */
-#define smp_mb__before_spinlock()   smp_mb()
-
 #include <asm-generic/barrier.h>
 
 #endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 0ce513f2926f..36fc7bfe9e11 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -91,6 +91,7 @@ static inline int hash__pgd_bad(pgd_t pgd)
 }
 #ifdef CONFIG_STRICT_KERNEL_RWX
 extern void hash__mark_rodata_ro(void);
+extern void hash__mark_initmem_nx(void);
 #endif
 
 extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3e3811..5b4023c616f7 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -59,13 +59,14 @@ extern struct patb_entry *partition_tb;
 #define PRTS_MASK	0x1f		/* process table size field */
 #define PRTB_MASK	0x0ffffffffffff000UL
 
-/*
- * Limit process table to PAGE_SIZE table. This
- * also limit the max pid we can support.
- * MAX_USER_CONTEXT * 16 bytes of space.
- */
-#define PRTB_SIZE_SHIFT	(CONTEXT_BITS + 4)
-#define PRTB_ENTRIES	(1ul << CONTEXT_BITS)
+/* Number of supported PID bits */
+extern unsigned int mmu_pid_bits;
+
+/* Base PID to allocate from */
+extern unsigned int mmu_base_pid;
+
+#define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
+#define PRTB_ENTRIES	(1ul << mmu_pid_bits)
 
 /*
  * Power9 currently only support 64K partition table size.
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c0737c86a362..818a58fc3f4f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -608,9 +608,17 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 	return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
 }
 
+/*
+ * This is potentially called with a pmd as the argument, in which case it's not
+ * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
+ * That's because the bit we use for _PAGE_DEVMAP is not reserved for software
+ * use in page directory entries (ie. non-ptes).
+ */
 static inline int pte_devmap(pte_t pte)
 {
-	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+	u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+
+	return (pte_raw(pte) & mask) == mask;
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
@@ -1192,5 +1200,6 @@ static inline const int pud_pfn(pud_t pud)
 	BUILD_BUG();
 	return 0;
 }
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 487709ff6875..544440b5aff3 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -118,6 +118,7 @@
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
 extern void radix__mark_rodata_ro(void);
+extern void radix__mark_initmem_nx(void);
 #endif
 
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 0151af6c2a50..87fcc1948817 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -18,7 +18,7 @@
 #include <asm/asm-offsets.h>
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 .macro EMIT_BUG_ENTRY addr,file,line,flags
-	 .section __bug_table,"a"
+	 .section __bug_table,"aw"
 5001:	 PPC_LONG \addr, 5002f
 	 .short \line, \flags
 	 .org 5001b+BUG_ENTRY_SIZE
@@ -29,7 +29,7 @@
 .endm
 #else
 .macro EMIT_BUG_ENTRY addr,file,line,flags
-	 .section __bug_table,"a"
+	 .section __bug_table,"aw"
 5001:	 PPC_LONG \addr
 	 .short \flags
 	 .org 5001b+BUG_ENTRY_SIZE
@@ -42,14 +42,14 @@
    sizeof(struct bug_entry), respectively */
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define _EMIT_BUG_ENTRY				\
-	".section __bug_table,\"a\"\n"		\
+	".section __bug_table,\"aw\"\n"		\
 	"2:\t" PPC_LONG "1b, %0\n"		\
 	"\t.short %1, %2\n"			\
 	".org 2b+%3\n"				\
 	".previous\n"
 #else
 #define _EMIT_BUG_ENTRY				\
-	".section __bug_table,\"a\"\n"		\
+	".section __bug_table,\"aw\"\n"		\
 	"2:\t" PPC_LONG "1b\n"			\
 	"\t.short %2\n"				\
 	".org 2b+%3\n"				\
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index eaada6c92344..719ed9b61ea7 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -29,18 +29,10 @@
 	: "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
 	: "cr0", "memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8b3f1238d07f..e372ed871c51 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -67,11 +67,6 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
-static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-							 unsigned long address)
-{
-}
-
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
 #define HPTEG_HASH_BITS_PTE_LONG	12
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index da7e9432fa8f..35bec1c5bd5a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -45,7 +45,7 @@ extern void set_context(unsigned long id, pgd_t *pgd);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 extern void radix__switch_mmu_context(struct mm_struct *prev,
-				     struct mm_struct *next);
+				      struct mm_struct *next);
 static inline void switch_mmu_context(struct mm_struct *prev,
 				      struct mm_struct *next,
 				      struct task_struct *tsk)
@@ -67,6 +67,12 @@ extern void __destroy_context(unsigned long context_id);
 extern void mmu_context_init(void);
 #endif
 
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
+#else
+static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
+#endif
+
 extern void switch_cop(struct mm_struct *next);
 extern int use_cop(unsigned long acop, struct mm_struct *mm);
 extern void drop_cop(unsigned long acop, struct mm_struct *mm);
@@ -79,10 +85,32 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
 				      struct mm_struct *next,
 				      struct task_struct *tsk)
 {
+	bool new_on_cpu = false;
+
 	/* Mark this context has been used on the new CPU */
-	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
+	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
 		cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
 
+		/*
+		 * This full barrier orders the store to the cpumask above vs
+		 * a subsequent operation which allows this CPU to begin loading
+		 * translations for next.
+		 *
+		 * When using the radix MMU that operation is the load of the
+		 * MMU context id, which is then moved to SPRN_PID.
+		 *
+		 * For the hash MMU it is either the first load from slb_cache
+		 * in switch_slb(), and/or the store of paca->mm_ctx_id in
+		 * copy_mm_to_paca().
+		 *
+		 * On the read side the barrier is in pte_xchg(), which orders
+		 * the store to the PTE vs the load of mm_cpumask.
+		 */
+		smp_mb();
+
+		new_on_cpu = true;
+	}
+
 	/* 32-bit keeps track of the current PGDIR in the thread struct */
 #ifdef CONFIG_PPC32
 	tsk->thread.pgdir = next->pgd;
@@ -109,6 +137,10 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
 		asm volatile ("dssall");
 #endif /* CONFIG_ALTIVEC */
+
+	if (new_on_cpu)
+		radix_kvm_prefetch_workaround(next);
+
 	/*
 	 * The actual HW switching method differs between the various
 	 * sub architectures. Out of line for now
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 9c0f5db5cf46..67e7e3d990f4 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -87,6 +87,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
 	unsigned long *p = (unsigned long *)ptep;
 	__be64 prev;
 
+	/* See comment in switch_mm_irqs_off() */
 	prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old),
 					     (__force unsigned long)pte_raw(new));
 
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 8bd3b13fe2fb..369a164b545c 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -62,6 +62,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
 {
 	unsigned long *p = (unsigned long *)ptep;
 
+	/* See comment in switch_mm_irqs_off() */
 	return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new));
 }
 #endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index dd01212935ac..afae9a336136 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -80,6 +80,13 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr);
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mark_initmem_nx(void);
+#else
+static inline void mark_initmem_nx(void) { }
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 8c1b913de6d7..edbe571bcc54 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	lock->slock = 0;
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	arch_spinlock_t lock_val;
-
-	smp_mb();
-
-	/*
-	 * Atomically load and store back the lock value (unchanged). This
-	 * ensures that our observation of the lock value is ordered with
-	 * respect to other lock operations.
-	 */
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0, 0, %2, 0) "\n"
-"	stwcx. %0, 0, %2\n"
-"	bne- 1b\n"
-	: "=&r" (lock_val), "+m" (*lock)
-	: "r" (lock)
-	: "cr0", "xer");
-
-	if (arch_spin_value_unlocked(lock_val))
-		goto out;
-
-	while (lock->slock) {
-		HMT_low();
-		if (SHARED_PROCESSOR)
-			__spin_yield(lock);
-	}
-	HMT_medium();
-
-out:
-	smp_mb();
-}
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -342,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)	__rw_yield(lock)
 #define arch_write_relax(lock)	__rw_yield(lock)
 
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()   smp_mb()
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h
index bfd609a3e928..e3b10469f787 100644
--- a/arch/powerpc/include/uapi/asm/ioctls.h
+++ b/arch/powerpc/include/uapi/asm/ioctls.h
@@ -100,7 +100,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 #define TIOCSERCONFIG	0x5453
 #define TIOCSERGWILD	0x5454
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 49d8422767b4..e925c1c99c71 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -223,17 +223,27 @@ system_call_exit:
 	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
 	bne-	.Lsyscall_exit_work
 
-	/* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */
-	li	r7,MSR_FP
+	andi.	r0,r8,MSR_FP
+	beq 2f
 #ifdef CONFIG_ALTIVEC
-	oris	r7,r7,MSR_VEC@h
+	andis.	r0,r8,MSR_VEC@h
+	bne	3f
 #endif
-	and	r0,r8,r7
-	cmpd	r0,r7
-	bne	.Lsyscall_restore_math
-.Lsyscall_restore_math_cont:
+2:	addi    r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
+	mtmsrd	r10,1		/* Restore RI */
+#endif
+	bl	restore_math
+#ifdef CONFIG_PPC_BOOK3S
+	li	r11,0
+	mtmsrd	r11,1
+#endif
+	ld	r8,_MSR(r1)
+	ld	r3,RESULT(r1)
+	li	r11,-MAX_ERRNO
 
-	cmpld	r3,r11
+3:	cmpld	r3,r11
 	ld	r5,_CCR(r1)
 	bge-	.Lsyscall_error
 .Lsyscall_error_cont:
@@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	std	r5,_CCR(r1)
 	b	.Lsyscall_error_cont
 
-.Lsyscall_restore_math:
-	/*
-	 * Some initial tests from restore_math to avoid the heavyweight
-	 * C code entry and MSR manipulations.
-	 */
-	LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK)
-	and.	r0,r0,r8
-	bne	1f
-
-	ld	r7,PACACURRENT(r13)
-	lbz	r0,THREAD+THREAD_LOAD_FP(r7)
-#ifdef CONFIG_ALTIVEC
-	lbz	r6,THREAD+THREAD_LOAD_VEC(r7)
-	add	r0,r0,r6
-#endif
-	cmpdi	r0,0
-	beq	.Lsyscall_restore_math_cont
-
-1:	addi    r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
-	li	r10,MSR_RI
-	mtmsrd	r10,1		/* Restore RI */
-#endif
-	bl	restore_math
-#ifdef CONFIG_PPC_BOOK3S
-	li	r11,0
-	mtmsrd	r11,1
-#endif
-	/* Restore volatiles, reload MSR from updated one */
-	ld	r8,_MSR(r1)
-	ld	r3,RESULT(r1)
-	li	r11,-MAX_ERRNO
-	b	.Lsyscall_restore_math_cont
-
 /* Traced system call support */
 .Lsyscall_dotrace:
 	bl	save_nvgprs
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e6d8354d79ef..f14f3c04ec7e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -824,7 +824,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
  * r3 volatile parameter and return value for status
  * r4-r10 volatile input and output value
  * r11 volatile hypercall number and output value
- * r12 volatile
+ * r12 volatile input and output value
  * r13-r31 nonvolatile
  * LR nonvolatile
  * CTR volatile
@@ -834,25 +834,26 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
  * Other registers nonvolatile
  *
  * The intersection of volatile registers that don't contain possible
- * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs
- * upon entry without saving.
+ * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry
+ * without saving, though xer is not a good idea to use, as hardware may
+ * interpret some bits so it may be costly to change them.
  */
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	/*
 	 * There is a little bit of juggling to get syscall and hcall
-	 * working well. Save r10 in ctr to be restored in case it is a
-	 * hcall.
+	 * working well. Save r13 in ctr to avoid using SPRG scratch
+	 * register.
 	 *
 	 * Userspace syscalls have already saved the PPR, hcalls must save
 	 * it before setting HMT_MEDIUM.
 	 */
 #define SYSCALL_KVMTEST							\
-	mr	r12,r13;						\
+	mtctr	r13;							\
 	GET_PACA(r13);							\
-	mtctr	r10;							\
+	std	r10,PACA_EXGEN+EX_R10(r13);				\
 	KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \
 	HMT_MEDIUM;							\
-	mr	r9,r12;							\
+	mfctr	r9;
 
 #else
 #define SYSCALL_KVMTEST							\
@@ -935,8 +936,8 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100)
 	 * This is a hcall, so register convention is as above, with these
 	 * differences:
 	 * r13 = PACA
-	 * r12 = orig r13
-	 * ctr = orig r10
+	 * ctr = orig r13
+	 * orig r10 saved in PACA
 	 */
 TRAMP_KVM_BEGIN(do_kvm_0xc00)
 	 /*
@@ -944,14 +945,13 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00)
 	  * HMT_MEDIUM. That allows the KVM code to save that value into the
 	  * guest state (it is the guest's PPR value).
 	  */
-	OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR)
+	OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR)
 	HMT_MEDIUM
-	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR)
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR)
 	mfctr	r10
-	SET_SCRATCH0(r12)
+	SET_SCRATCH0(r10)
 	std	r9,PACA_EXGEN+EX_R9(r13)
 	mfcr	r9
-	std	r10,PACA_EXGEN+EX_R10(r13)
 	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
 #endif
 
@@ -1325,10 +1325,18 @@ EXC_VIRT_NONE(0x5800, 0x100)
 	std	r10,PACA_EXGEN+EX_R13(r13);		\
 	EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H)
 
+/*
+ * Branch to soft_nmi_interrupt using the emergency stack. The emergency
+ * stack is one that is usable by maskable interrupts so long as MSR_EE
+ * remains off. It is used for recovery when something has corrupted the
+ * normal kernel stack, for example. The "soft NMI" must not use the process
+ * stack because we want irq disabled sections to avoid touching the stack
+ * at all (other than PMU interrupts), so use the emergency stack for this,
+ * and run it entirely with interrupts hard disabled.
+ */
 EXC_COMMON_BEGIN(soft_nmi_common)
 	mr	r10,r1
 	ld	r1,PACAEMERGSP(r13)
-	ld	r1,PACA_NMI_EMERG_SP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
 	EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900,
 			system_reset, soft_nmi_interrupt,
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 5adb390e773b..e6252c5a57a4 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -30,6 +30,7 @@
  * Use unused space in the interrupt stack to save and restore
  * registers for winkle support.
  */
+#define _MMCR0	GPR0
 #define _SDR1	GPR3
 #define _PTCR	GPR3
 #define _RPR	GPR4
@@ -272,6 +273,14 @@ power_enter_stop:
 	b 	pnv_wakeup_noloss
 
 .Lhandle_esl_ec_set:
+	/*
+	 * POWER9 DD2 can incorrectly set PMAO when waking up after a
+	 * state-loss idle. Saving and restoring MMCR0 over idle is a
+	 * workaround.
+	 */
+	mfspr	r4,SPRN_MMCR0
+	std	r4,_MMCR0(r1)
+
 /*
  * Check if the requested state is a deep idle state.
  */
@@ -450,10 +459,20 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 pnv_restore_hyp_resource_arch300:
 	/*
 	 * Workaround for POWER9, if we lost resources, the ERAT
-	 * might have been mixed up and needs flushing.
+	 * might have been mixed up and needs flushing. We also need
+	 * to reload MMCR0 (see comment above). We also need to set
+	 * then clear bit 60 in MMCRA to ensure the PMU starts running.
 	 */
 	blt	cr3,1f
 	PPC_INVALIDATE_ERAT
+	ld	r1,PACAR1(r13)
+	mfspr	r4,SPRN_MMCRA
+	ori	r4,r4,(1 << (63-60))
+	mtspr	SPRN_MMCRA,r4
+	xori	r4,r4,(1 << (63-60))
+	mtspr	SPRN_MMCRA,r4
+	ld	r4,_MMCR0(r1)
+	mtspr	SPRN_MMCR0,r4
 1:
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0bcec745a672..f291f7826abc 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -145,6 +145,19 @@ notrace unsigned int __check_irq_replay(void)
 
 	/* Clear bit 0 which we wouldn't clear otherwise */
 	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+	if (happened & PACA_IRQ_HARD_DIS) {
+		/*
+		 * We may have missed a decrementer interrupt if hard disabled.
+		 * Check the decrementer register in case we had a rollover
+		 * while hard disabled.
+		 */
+		if (!(happened & PACA_IRQ_DEC)) {
+			if (decrementer_check_overflow()) {
+				local_paca->irq_happened |= PACA_IRQ_DEC;
+				happened |= PACA_IRQ_DEC;
+			}
+		}
+	}
 
 	/*
 	 * Force the delivery of pending soft-disabled interrupts on PS3.
@@ -170,7 +183,7 @@ notrace unsigned int __check_irq_replay(void)
 	 * in case we also had a rollover while hard disabled
 	 */
 	local_paca->irq_happened &= ~PACA_IRQ_DEC;
-	if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+	if (happened & PACA_IRQ_DEC)
 		return 0x900;
 
 	/* Finally check if an external interrupt happened */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f3e2c932dcc..1f0fd361e09b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -362,7 +362,8 @@ void enable_kernel_vsx(void)
 
 	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+	if (current->thread.regs &&
+	    (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) {
 		check_if_tm_restore_required(current);
 		/*
 		 * If a thread has already been reclaimed then the
@@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
 		preempt_disable();
-		if (tsk->thread.regs->msr & MSR_VSX) {
+		if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) {
 			BUG_ON(tsk != current);
 			giveup_vsx(tsk);
 		}
@@ -511,10 +512,6 @@ void restore_math(struct pt_regs *regs)
 {
 	unsigned long msr;
 
-	/*
-	 * Syscall exit makes a similar initial check before branching
-	 * to restore_math. Keep them in synch.
-	 */
 	if (!msr_tm_active(regs->msr) &&
 		!current->thread.load_fp && !loadvec(current->thread))
 		return;
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 925a4ef90559..660ed39e9c9a 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk)
 	 * If task is not current, it will have been flushed already to
 	 * it's thread_struct during __switch_to().
 	 *
-	 * A reclaim flushes ALL the state.
+	 * A reclaim flushes ALL the state or if not in TM save TM SPRs
+	 * in the appropriate thread structures from live.
 	 */
 
-	if (tsk == current && MSR_TM_SUSPENDED(mfmsr()))
-		tm_reclaim_current(TM_CAUSE_SIGNAL);
+	if (tsk != current)
+		return;
 
+	if (MSR_TM_SUSPENDED(mfmsr())) {
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
+	} else {
+		tm_enable();
+		tm_save_sprs(&(tsk->thread));
+	}
 }
 #else
 static inline void flush_tmregs_to_thread(struct task_struct *tsk) { }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 997c88d54acf..8d3320562c70 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -351,7 +351,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
 	hard_irq_disable();
 	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
 		raw_local_irq_restore(*flags);
-		cpu_relax();
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
 		raw_local_irq_save(*flags);
 		hard_irq_disable();
 	}
@@ -360,7 +360,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
 static void nmi_ipi_lock(void)
 {
 	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
-		cpu_relax();
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
 }
 
 static void nmi_ipi_unlock(void)
@@ -475,7 +475,7 @@ int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
 	nmi_ipi_lock_start(&flags);
 	while (nmi_ipi_busy_count) {
 		nmi_ipi_unlock_end(&flags);
-		cpu_relax();
+		spin_until_cond(nmi_ipi_busy_count == 0);
 		nmi_ipi_lock_start(&flags);
 	}
 
@@ -1003,21 +1003,13 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ NULL, },
 };
 
-static __init long smp_setup_cpu_workfn(void *data __always_unused)
-{
-	smp_ops->setup_cpu(boot_cpuid);
-	return 0;
-}
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	/*
-	 * We want the setup_cpu() here to be called on the boot CPU, but
-	 * init might run on any CPU, so make sure it's invoked on the boot
-	 * CPU.
+	 * We are running pinned to the boot CPU, see rest_init().
 	 */
 	if (smp_ops && smp_ops->setup_cpu)
-		work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
+		smp_ops->setup_cpu(boot_cpuid);
 
 	if (smp_ops && smp_ops->bringup_done)
 		smp_ops->bringup_done();
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index b67f8b03a32d..34721a257a77 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -71,15 +71,20 @@ static inline void wd_smp_lock(unsigned long *flags)
 	 * This may be called from low level interrupt handlers at some
 	 * point in future.
 	 */
-	local_irq_save(*flags);
-	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock)))
-		cpu_relax();
+	raw_local_irq_save(*flags);
+	hard_irq_disable(); /* Make it soft-NMI safe */
+	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
+		raw_local_irq_restore(*flags);
+		spin_until_cond(!test_bit(0, &__wd_smp_lock));
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
 }
 
 static inline void wd_smp_unlock(unsigned long *flags)
 {
 	clear_bit_unlock(0, &__wd_smp_lock);
-	local_irq_restore(*flags);
+	raw_local_irq_restore(*flags);
 }
 
 static void wd_lockup_ipi(struct pt_regs *regs)
@@ -96,10 +101,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 		nmi_panic(regs, "Hard LOCKUP");
 }
 
-static void set_cpu_stuck(int cpu, u64 tb)
+static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 {
-	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
-	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
+	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		wd_smp_last_reset_tb = tb;
 		cpumask_andnot(&wd_smp_cpus_pending,
@@ -107,6 +112,10 @@ static void set_cpu_stuck(int cpu, u64 tb)
 				&wd_smp_cpus_stuck);
 	}
 }
+static void set_cpu_stuck(int cpu, u64 tb)
+{
+	set_cpumask_stuck(cpumask_of(cpu), tb);
+}
 
 static void watchdog_smp_panic(int cpu, u64 tb)
 {
@@ -135,11 +144,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	}
 	smp_flush_nmi_ipi(1000000);
 
-	/* Take the stuck CPU out of the watch group */
-	for_each_cpu(c, &wd_smp_cpus_pending)
-		set_cpu_stuck(c, tb);
+	/* Take the stuck CPUs out of the watch group */
+	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
 
-out:
 	wd_smp_unlock(&flags);
 
 	printk_safe_flush();
@@ -152,6 +159,11 @@ out:
 
 	if (hardlockup_panic)
 		nmi_panic(NULL, "Hard LOCKUP");
+
+	return;
+
+out:
+	wd_smp_unlock(&flags);
 }
 
 static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
@@ -258,9 +270,11 @@ static void wd_timer_fn(unsigned long data)
 
 void arch_touch_nmi_watchdog(void)
 {
+	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
 
-	watchdog_timer_interrupt(cpu);
+	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
+		watchdog_timer_interrupt(cpu);
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
@@ -283,6 +297,8 @@ static void stop_watchdog_timer_on(unsigned int cpu)
 
 static int start_wd_on_cpu(unsigned int cpu)
 {
+	unsigned long flags;
+
 	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
 		WARN_ON(1);
 		return 0;
@@ -297,12 +313,14 @@ static int start_wd_on_cpu(unsigned int cpu)
 	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
 		return 0;
 
+	wd_smp_lock(&flags);
 	cpumask_set_cpu(cpu, &wd_cpus_enabled);
 	if (cpumask_weight(&wd_cpus_enabled) == 1) {
 		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
 		wd_smp_last_reset_tb = get_tb();
 	}
-	smp_wmb();
+	wd_smp_unlock(&flags);
+
 	start_watchdog_timer_on(cpu);
 
 	return 0;
@@ -310,12 +328,17 @@ static int start_wd_on_cpu(unsigned int cpu)
 
 static int stop_wd_on_cpu(unsigned int cpu)
 {
+	unsigned long flags;
+
 	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
 		return 0; /* Can happen in CPU unplug case */
 
 	stop_watchdog_timer_on(cpu);
 
+	wd_smp_lock(&flags);
 	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+	wd_smp_unlock(&flags);
+
 	wd_smp_clear_cpu_pending(cpu, get_tb());
 
 	return 0;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8cb0190e2a73..b42812e014c0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -164,8 +164,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 		goto out;
 	}
 
-	if (kvm->arch.hpt.virt)
+	if (kvm->arch.hpt.virt) {
 		kvmppc_free_hpt(&kvm->arch.hpt);
+		kvmppc_rmap_reset(kvm);
+	}
 
 	err = kvmppc_allocate_hpt(&info, order);
 	if (err < 0)
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a160c14304eb..53766e2bc029 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
+	struct kvmppc_spapr_tce_table *siter;
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
+	int fd = -1;
 
 	if (!args->size)
 		return -EINVAL;
 
-	/* Check this LIOBN hasn't been previously allocated */
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == args->liobn)
-			return -EBUSY;
-	}
-
 	size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
-	if (ret) {
-		stt = NULL;
-		goto fail;
-	}
+	if (ret)
+		return ret;
 
 	ret = -ENOMEM;
 	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
 		      GFP_KERNEL);
 	if (!stt)
-		goto fail;
+		goto fail_acct;
 
 	stt->liobn = args->liobn;
 	stt->page_shift = args->page_shift;
@@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			goto fail;
 	}
 
-	kvm_get_kvm(kvm);
+	ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				    stt, O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		goto fail;
 
 	mutex_lock(&kvm->lock);
-	list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	if (!ret) {
+		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+		kvm_get_kvm(kvm);
+	}
 
 	mutex_unlock(&kvm->lock);
 
-	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				stt, O_RDWR | O_CLOEXEC);
+	if (!ret)
+		return fd;
 
-fail:
-	if (stt) {
-		for (i = 0; i < npages; i++)
-			if (stt->pages[i])
-				__free_page(stt->pages[i]);
+	put_unused_fd(fd);
 
-		kfree(stt);
-	}
+ fail:
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
+
+	kfree(stt);
+ fail_acct:
+	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0b436df746fc..359c79cdf0cc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3211,6 +3211,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			run->fail_entry.hardware_entry_failure_reason = 0;
 			return -EINVAL;
 		}
+		/* Enable TM so we can read the TM SPRs */
+		mtmsr(mfmsr() | MSR_TM);
 		current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
 		current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
 		current->thread.tm_texasr = mfspr(SPRN_TEXASR);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index cb44065e2946..9c9c983b864f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	/* Hypervisor doorbell - exit only if host IPI flag set */
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
 	bne	3f
+BEGIN_FTR_SECTION
+	PPC_MSGSYNC
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	lbz	r0, HSTATE_HOST_IPI(r13)
 	cmpwi	r0, 0
 	beq	4f
@@ -1443,12 +1446,14 @@ mc_cont:
 	ori	r6,r6,1
 	mtspr	SPRN_CTRLT,r6
 4:
-	/* Read the guest SLB and save it away */
+	/* Check if we are running hash or radix and store it in cr2 */
 	ld	r5, VCPU_KVM(r9)
 	lbz	r0, KVM_RADIX(r5)
-	cmpwi	r0, 0
+	cmpwi	cr2,r0,0
+
+	/* Read the guest SLB and save it away */
 	li	r5, 0
-	bne	3f			/* for radix, save 0 entries */
+	bne	cr2, 3f			/* for radix, save 0 entries */
 	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
 	mtctr	r0
 	li	r6,0
@@ -1712,11 +1717,6 @@ BEGIN_FTR_SECTION_NESTED(96)
 END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 22:
-	/* Clear out SLB */
-	li	r5,0
-	slbmte	r5,r5
-	slbia
-	ptesync
 
 	/* Restore host values of some registers */
 BEGIN_FTR_SECTION
@@ -1737,10 +1737,56 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_PID, r7
 	mtspr	SPRN_IAMR, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
+#ifdef CONFIG_PPC_RADIX_MMU
+	/*
+	 * Are we running hash or radix ?
+	 */
+	beq	cr2,3f
+
+	/* Radix: Handle the case where the guest used an illegal PID */
+	LOAD_REG_ADDR(r4, mmu_base_pid)
+	lwz	r3, VCPU_GUEST_PID(r9)
+	lwz	r5, 0(r4)
+	cmpw	cr0,r3,r5
+	blt	2f
+
+	/*
+	 * Illegal PID, the HW might have prefetched and cached in the TLB
+	 * some translations for the  LPID 0 / guest PID combination which
+	 * Linux doesn't know about, so we need to flush that PID out of
+	 * the TLB. First we need to set LPIDR to 0 so tlbiel applies to
+	 * the right context.
+	*/
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	isync
+
+	/* Then do a congruence class local flush */
+	ld	r6,VCPU_KVM(r9)
+	lwz	r0,KVM_TLB_SETS(r6)
+	mtctr	r0
+	li	r7,0x400		/* IS field = 0b01 */
+	ptesync
+	sldi	r0,r3,32		/* RS has PID */
+1:	PPC_TLBIEL(7,0,2,1,1)		/* RIC=2, PRS=1, R=1 */
+	addi	r7,r7,0x1000
+	bdnz	1b
+	ptesync
+
+2:	/* Flush the ERAT on radix P9 DD1 guest exit */
 BEGIN_FTR_SECTION
 	PPC_INVALIDATE_ERAT
 END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+	b	4f
+#endif /* CONFIG_PPC_RADIX_MMU */
 
+	/* Hash: clear out SLB */
+3:	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+4:
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 4636ca6e7d38..d1ed2c41b5d2 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -16,7 +16,22 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
 	u8 cppr;
 	u16 ack;
 
-	/* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
+	/*
+	 * Ensure any previous store to CPPR is ordered vs.
+	 * the subsequent loads from PIPR or ACK.
+	 */
+	eieio();
+
+	/*
+	 * DD1 bug workaround: If PIPR is less favored than CPPR
+	 * ignore the interrupt or we might incorrectly lose an IPB
+	 * bit.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+		if (pipr >= xc->hw_cppr)
+			return;
+	}
 
 	/* Perform the acknowledge OS to register cycle. */
 	ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
@@ -235,6 +250,11 @@ skip_ipi:
 	/*
 	 * If we found an interrupt, adjust what the guest CPPR should
 	 * be as if we had just fetched that interrupt from HW.
+	 *
+	 * Note: This can only make xc->cppr smaller as the previous
+	 * loop will only exit with hirq != 0 if prio is lower than
+	 * the current xc->cppr. Thus we don't need to re-check xc->mfrr
+	 * for pending IPIs.
 	 */
 	if (hirq)
 		xc->cppr = prio;
@@ -381,6 +401,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
 	xc->cppr = cppr;
 
 	/*
+	 * Order the above update of xc->cppr with the subsequent
+	 * read of xc->mfrr inside push_pending_to_hw()
+	 */
+	smp_mb();
+
+	/*
 	 * We are masking less, we need to look for pending things
 	 * to deliver and set VP pending bits accordingly to trigger
 	 * a new interrupt otherwise we might miss MFRR changes for
@@ -420,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
 	 * used to signal MFRR changes is EOId when fetched from
 	 * the queue.
 	 */
-	if (irq == XICS_IPI || irq == 0)
+	if (irq == XICS_IPI || irq == 0) {
+		/*
+		 * This barrier orders the setting of xc->cppr vs.
+		 * subsquent test of xc->mfrr done inside
+		 * scan_interrupts and push_pending_to_hw
+		 */
+		smp_mb();
 		goto bail;
+	}
 
 	/* Find interrupt source */
 	sb = kvmppc_xive_find_source(xive, irq, &src);
 	if (!sb) {
 		pr_devel(" source not found !\n");
 		rc = H_PARAMETER;
+		/* Same as above */
+		smp_mb();
 		goto bail;
 	}
 	state = &sb->irq_state[src];
 	kvmppc_xive_select_irq(state, &hw_num, &xd);
 
 	state->in_eoi = true;
-	mb();
+
+	/*
+	 * This barrier orders both setting of in_eoi above vs,
+	 * subsequent test of guest_priority, and the setting
+	 * of xc->cppr vs. subsquent test of xc->mfrr done inside
+	 * scan_interrupts and push_pending_to_hw
+	 */
+	smp_mb();
 
 again:
 	if (state->guest_priority == MASKED) {
@@ -461,6 +503,14 @@ again:
 
 	}
 
+	/*
+	 * This barrier orders the above guest_priority check
+	 * and spin_lock/unlock with clearing in_eoi below.
+	 *
+	 * It also has to be a full mb() as it must ensure
+	 * the MMIOs done in source_eoi() are completed before
+	 * state->in_eoi is visible.
+	 */
 	mb();
 	state->in_eoi = false;
 bail:
@@ -495,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
 	/* Locklessly write over MFRR */
 	xc->mfrr = mfrr;
 
+	/*
+	 * The load of xc->cppr below and the subsequent MMIO store
+	 * to the IPI must happen after the above mfrr update is
+	 * globally visible so that:
+	 *
+	 * - Synchronize with another CPU doing an H_EOI or a H_CPPR
+	 *   updating xc->cppr then reading xc->mfrr.
+	 *
+	 * - The target of the IPI sees the xc->mfrr update
+	 */
+	mb();
+
 	/* Shoot the IPI if most favored than target cppr */
 	if (mfrr < xc->cppr)
 		__x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8541f18694a4..46b4e67d2372 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -402,6 +402,7 @@ void __init mem_init(void)
 void free_initmem(void)
 {
 	ppc_md.progress = ppc_printk_progress;
+	mark_initmem_nx();
 	free_initmem_default(POISON_FREE_INITMEM);
 }
 
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index abed1fe6992f..a75f63833284 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -126,9 +126,10 @@ static int hash__init_new_context(struct mm_struct *mm)
 static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
-	int index;
+	int index, max_id;
 
-	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	max_id = (1 << mmu_pid_bits) - 1;
+	index = alloc_context_id(mmu_base_pid, max_id);
 	if (index < 0)
 		return index;
 
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 188b4107584d..443a2c66a304 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -425,33 +425,51 @@ int hash__has_transparent_hugepage(void)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-void hash__mark_rodata_ro(void)
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+				      unsigned long newpp)
 {
-	unsigned long start = (unsigned long)_stext;
-	unsigned long end = (unsigned long)__init_begin;
 	unsigned long idx;
 	unsigned int step, shift;
-	unsigned long newpp = PP_RXXX;
 
 	shift = mmu_psize_defs[mmu_linear_psize].shift;
 	step = 1 << shift;
 
-	start = ((start + step - 1) >> shift) << shift;
-	end = (end >> shift) << shift;
+	start = ALIGN_DOWN(start, step);
+	end = ALIGN(end, step); // aligns up
 
-	pr_devel("marking ro start %lx, end %lx, step %x\n",
-			start, end, step);
+	if (start >= end)
+		return false;
 
-	if (start == end) {
-		pr_warn("could not set rodata ro, relocate the start"
-			" of the kernel to a 0x%x boundary\n", step);
-		return;
-	}
+	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+		 start, end, newpp, step);
 
 	for (idx = start; idx < end; idx += step)
 		/* Not sure if we can do much with the return value */
 		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
 							mmu_kernel_ssize);
 
+	return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+	unsigned long start, end;
+
+	start = (unsigned long)_stext;
+	end = (unsigned long)__init_begin;
+
+	WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+}
+
+void hash__mark_initmem_nx(void)
+{
+	unsigned long start, end, pp;
+
+	start = (unsigned long)__init_begin;
+	end = (unsigned long)__init_end;
+
+	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+
+	WARN_ON(!hash__change_memory_range(start, end, pp));
 }
 #endif
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 8c13e4282308..671a45d86c18 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -25,6 +25,9 @@
 
 #include <trace/events/thp.h>
 
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
 					 unsigned long table_size)
 {
@@ -112,10 +115,9 @@ set_the_pte:
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-void radix__mark_rodata_ro(void)
+void radix__change_memory_range(unsigned long start, unsigned long end,
+				unsigned long clear)
 {
-	unsigned long start = (unsigned long)_stext;
-	unsigned long end = (unsigned long)__init_begin;
 	unsigned long idx;
 	pgd_t *pgdp;
 	pud_t *pudp;
@@ -125,7 +127,8 @@ void radix__mark_rodata_ro(void)
 	start = ALIGN_DOWN(start, PAGE_SIZE);
 	end = PAGE_ALIGN(end); // aligns up
 
-	pr_devel("marking ro start %lx, end %lx\n", start, end);
+	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
+		 start, end, clear);
 
 	for (idx = start; idx < end; idx += PAGE_SIZE) {
 		pgdp = pgd_offset_k(idx);
@@ -147,11 +150,29 @@ void radix__mark_rodata_ro(void)
 		if (!ptep)
 			continue;
 update_the_pte:
-		radix__pte_update(&init_mm, idx, ptep, _PAGE_WRITE, 0, 0);
+		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
 	}
 
 	radix__flush_tlb_kernel_range(start, end);
 }
+
+void radix__mark_rodata_ro(void)
+{
+	unsigned long start, end;
+
+	start = (unsigned long)_stext;
+	end = (unsigned long)__init_begin;
+
+	radix__change_memory_range(start, end, _PAGE_WRITE);
+}
+
+void radix__mark_initmem_nx(void)
+{
+	unsigned long start = (unsigned long)__init_begin;
+	unsigned long end = (unsigned long)__init_end;
+
+	radix__change_memory_range(start, end, _PAGE_EXEC);
+}
 #endif /* CONFIG_STRICT_KERNEL_RWX */
 
 static inline void __meminit print_mapping(unsigned long start,
@@ -243,11 +264,34 @@ static void __init radix_init_pgtable(void)
 	for_each_memblock(memory, reg)
 		WARN_ON(create_physical_mapping(reg->base,
 						reg->base + reg->size));
+
+	/* Find out how many PID bits are supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 20;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		/*
+		 * When KVM is possible, we only use the top half of the
+		 * PID space to avoid collisions between host and guest PIDs
+		 * which can cause problems due to prefetch when exiting the
+		 * guest with AIL=3
+		 */
+		mmu_base_pid = 1 << (mmu_pid_bits - 1);
+#else
+		mmu_base_pid = 1;
+#endif
+	} else {
+		/* The guest uses the bottom half of the PID space */
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 19;
+		mmu_base_pid = 1;
+	}
+
 	/*
 	 * Allocate Partition table and process table for the
 	 * host.
 	 */
-	BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 36), "Process table size too large.");
+	BUG_ON(PRTB_SIZE_SHIFT > 36);
 	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
 	/*
 	 * Fill in the process table.
@@ -321,6 +365,12 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
+	/* Find MMU PID size */
+	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+	if (prop && size == 4)
+		mmu_pid_bits = be32_to_cpup(prop);
+
+	/* Grab page size encodings */
 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 	if (!prop)
 		return 0;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 5c0b795d656c..0736e94c7615 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -505,4 +505,12 @@ void mark_rodata_ro(void)
 	else
 		hash__mark_rodata_ro();
 }
+
+void mark_initmem_nx(void)
+{
+	if (radix_enabled())
+		radix__mark_initmem_nx();
+	else
+		hash__mark_initmem_nx();
+}
 #endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index e94fbd4c8845..781532d7bc4d 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -36,7 +36,7 @@ void subpage_prot_free(struct mm_struct *mm)
 		}
 	}
 	addr = 0;
-	for (i = 0; i < 2; ++i) {
+	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
 		p = spt->protptrs[i];
 		if (!p)
 			continue;
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 744e0164ecf5..16ae1bbe13f0 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,12 +12,12 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
-#include <asm/ppc-opcode.h>
 
+#include <asm/ppc-opcode.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/trace.h>
-
+#include <asm/cputhreads.h>
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -454,3 +454,44 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
 }
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+	unsigned int pid = mm->context.id;
+
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	/*
+	 * If this context hasn't run on that CPU before and KVM is
+	 * around, there's a slim chance that the guest on another
+	 * CPU just brought in obsolete translation into the TLB of
+	 * this CPU due to a bad prefetch using the guest PID on
+	 * the way into the hypervisor.
+	 *
+	 * We work around this here. If KVM is possible, we check if
+	 * any sibling thread is in KVM. If it is, the window may exist
+	 * and thus we flush that PID from the core.
+	 *
+	 * A potential future improvement would be to mark which PIDs
+	 * have never been used on the system and avoid it if the PID
+	 * is new and the process has no other cpumask bit set.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+		int cpu = smp_processor_id();
+		int sib = cpu_first_thread_sibling(cpu);
+		bool flush = false;
+
+		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+			if (sib == cpu)
+				continue;
+			if (paca[sib].kvm_hstate.kvm_vcpu)
+				flush = true;
+		}
+		if (flush)
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6c2d4168daec..2e3eb7431571 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+		if (event->attr.sample_type &
+		    (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
 			perf_get_data_addr(regs, &data.addr);
 
 		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
index d7c9b186954d..763ffca9628d 100644
--- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c
@@ -89,7 +89,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk,
 			goto err;
 
 		ret = of_irq_to_resource(np, 0, &res[1]);
-		if (!ret)
+		if (ret <= 0)
 			goto err;
 
 		pdev = platform_device_alloc("mpc83xx_spi", i);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2abee070373f..a553aeea7af6 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,6 +56,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  */
 static u64 pnv_deepest_stop_psscr_val;
 static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
 
 static int pnv_save_sprs_for_deep_states(void)
@@ -185,8 +186,40 @@ static void pnv_alloc_idle_core_states(void)
 
 	update_subcore_sibling_mask();
 
-	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
-		pnv_save_sprs_for_deep_states();
+	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+		int rc = pnv_save_sprs_for_deep_states();
+
+		if (likely(!rc))
+			return;
+
+		/*
+		 * The stop-api is unable to restore hypervisor
+		 * resources on wakeup from platform idle states which
+		 * lose full context. So disable such states.
+		 */
+		supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+		pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+		pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+		if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+		    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+			/*
+			 * Use the default stop state for CPU-Hotplug
+			 * if available.
+			 */
+			if (default_stop_found) {
+				pnv_deepest_stop_psscr_val =
+					pnv_default_stop_val;
+				pnv_deepest_stop_psscr_mask =
+					pnv_default_stop_mask;
+				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+					pnv_deepest_stop_psscr_val);
+			} else { /* Fallback to snooze loop for CPU-Hotplug */
+				deepest_stop_found = false;
+				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+			}
+		}
+	}
 }
 
 u32 pnv_get_supported_cpuidle_states(void)
@@ -375,7 +408,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 						pnv_deepest_stop_psscr_val;
 		srr1 = power9_idle_stop(psscr);
 
-	} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+	} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
+		   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
 		srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
 	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
 		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
@@ -553,6 +587,7 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 			max_residency_ns = residency_ns[i];
 			pnv_deepest_stop_psscr_val = psscr_val[i];
 			pnv_deepest_stop_psscr_mask = psscr_mask[i];
+			pnv_deepest_stop_flag = flags[i];
 			deepest_stop_found = true;
 		}
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index b5d960d6db3d..4c7b8591f737 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
 	mmio_invalidate(npu_context, 1, address, true);
 }
 
-static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long address)
-{
-	struct npu_context *npu_context = mn_to_npu_context(mn);
-
-	mmio_invalidate(npu_context, 1, address, true);
-}
-
 static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long start, unsigned long end)
@@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
 	.release = pnv_npu2_mn_release,
 	.change_pte = pnv_npu2_mn_change_pte,
-	.invalidate_page = pnv_npu2_mn_invalidate_page,
 	.invalidate_range = pnv_npu2_mn_invalidate_range,
 };
 
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 9b87abb178f0..cad6b57ce494 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -78,7 +78,7 @@ void opal_configure_cores(void)
 	 *  ie. Host hash  supports  hash guests
 	 *      Host radix supports  hash/radix guests
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+	if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
 		reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
 		if (early_radix_enabled())
 			reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 437613588df1..b900eb1d5e17 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1852,6 +1852,14 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 			/* 4GB offset bypasses 32-bit space */
 			set_dma_offset(&pdev->dev, (1ULL << 32));
 			set_dma_ops(&pdev->dev, &dma_direct_ops);
+		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
+			/*
+			 * Fail the request if a DMA mask between 32 and 64 bits
+			 * was requested but couldn't be fulfilled. Ideally we
+			 * would do this for 64-bits but historically we have
+			 * always fallen back to 32-bits.
+			 */
+			return -ENOMEM;
 		} else {
 			dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
 			set_dma_ops(&pdev->dev, &dma_iommu_ops);
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index e5bf1e84047f..011ef2180fe6 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -82,7 +82,6 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 
 	of_detach_node(np);
 	of_node_put(parent);
-	of_node_put(np); /* Must decrement the refcount */
 	return 0;
 }
 
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index 1bbd9dbfe4e0..ce9cc123988b 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -14,7 +14,7 @@
 		".section .rodata.str,\"aMS\",@progbits,1\n"	\
 		"2:	.asciz	\""__FILE__"\"\n"		\
 		".previous\n"					\
-		".section __bug_table,\"a\"\n"			\
+		".section __bug_table,\"aw\"\n"			\
 		"3:	.long	1b-3b,2b-3b\n"			\
 		"	.short	%0,%1\n"			\
 		"	.org	3b+%2\n"			\
@@ -30,7 +30,7 @@
 	asm volatile(					\
 		"0:	j	0b+2\n"			\
 		"1:\n"					\
-		".section __bug_table,\"a\"\n"		\
+		".section __bug_table,\"aw\"\n"		\
 		"2:	.long	1b-2b\n"		\
 		"	.short	%0\n"			\
 		"	.org	2b+%1\n"		\
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index b9300f8aee10..07a82bc933a7 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -8,11 +8,12 @@
 #include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
 
-#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64))
+#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \
+				typeof(0?(__force t)0:0ULL), u64))
 
 #define __SC_DELOUSE(t,v) ({ \
 	BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \
-	(t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
+	(__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
 })
 
 #define PSW32_MASK_PER		0x40000000UL
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index a4811aa0304d..8f8eec9e1198 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -21,17 +21,12 @@
 		: "0" (-EFAULT), "d" (oparg), "a" (uaddr),		\
 		  "m" (*uaddr) : "cc");
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, newval, ret;
 
 	load_kernel_asce();
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
 
 	pagefault_disable();
 	switch (op) {
@@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 4541ac44b35f..24bc41622a98 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,11 @@ static inline int init_new_context(struct task_struct *tsk,
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
 		break;
+	case -PAGE_SIZE:
+		/* forked 5-level task, set new asce with new_mm->pgd */
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+		break;
 	case 1UL << 53:
 		/* forked 4-level task, set new asce with new mm->pgd */
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index f7838ecd83c6..217ee5210c32 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -98,13 +98,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 		: "cc", "memory");
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	while (arch_spin_is_locked(lock))
-		arch_spin_relax(lock);
-	smp_acquire__after_ctrl_dep();
-}
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 7317b3108a88..2eb8ff0d6fca 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -47,10 +47,9 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
-static inline void tlb_gather_mmu(struct mmu_gather *tlb,
-				  struct mm_struct *mm,
-				  unsigned long start,
-				  unsigned long end)
+static inline void
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -76,9 +75,15 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_flush_mmu_free(tlb);
 }
 
-static inline void tlb_finish_mmu(struct mmu_gather *tlb,
-				  unsigned long start, unsigned long end)
+static inline void
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+	}
+
 	tlb_flush_mmu(tlb);
 }
 
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 0c82f7903fc7..c1bf75ffb875 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -998,7 +998,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	psw_bits(regs.psw).ia	= sfr->basic.ia;
 	psw_bits(regs.psw).dat	= sfr->basic.T;
 	psw_bits(regs.psw).wait = sfr->basic.W;
-	psw_bits(regs.psw).per	= sfr->basic.P;
+	psw_bits(regs.psw).pstate = sfr->basic.P;
 	psw_bits(regs.psw).as	= sfr->basic.AS;
 
 	/*
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3f2884e99ed4..af09d3437631 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1324,7 +1324,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
 	uint8_t *keys;
 	uint64_t hva;
-	int i, r = 0;
+	int srcu_idx, i, r = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
@@ -1342,6 +1342,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 		return -ENOMEM;
 
 	down_read(&current->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
 	for (i = 0; i < args->count; i++) {
 		hva = gfn_to_hva(kvm, args->start_gfn + i);
 		if (kvm_is_error_hva(hva)) {
@@ -1353,6 +1354,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 		if (r)
 			break;
 	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	up_read(&current->mm->mmap_sem);
 
 	if (!r) {
@@ -1370,7 +1372,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
 	uint8_t *keys;
 	uint64_t hva;
-	int i, r = 0;
+	int srcu_idx, i, r = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
@@ -1396,6 +1398,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 		goto out;
 
 	down_read(&current->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
 	for (i = 0; i < args->count; i++) {
 		hva = gfn_to_hva(kvm, args->start_gfn + i);
 		if (kvm_is_error_hva(hva)) {
@@ -1413,6 +1416,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 		if (r)
 			break;
 	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	up_read(&current->mm->mmap_sem);
 out:
 	kvfree(keys);
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index 926b5244263e..a2e5c24f47a7 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -394,7 +394,7 @@ static int sthyi(u64 vaddr)
 		"srl     %[cc],28\n"
 		: [cc] "=d" (cc)
 		: [code] "d" (code), [addr] "a" (addr)
-		: "memory", "cc");
+		: "3", "memory", "cc");
 	return cc;
 }
 
@@ -425,7 +425,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
 	VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
 	trace_kvm_s390_handle_sthyi(vcpu, code, addr);
 
-	if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+	if (reg1 == reg2 || reg1 & 1 || reg2 & 1)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
 	if (code & 0xffff) {
@@ -433,6 +433,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (addr & ~PAGE_MASK)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
 	/*
 	 * If the page has not yet been faulted in, we want to do that
 	 * now and not after all the expensive calculations.
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 2e10d2b8ad35..5bea139517a2 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -119,7 +119,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		return addr;
 
 check_asce_limit:
-	if (addr + len > current->mm->context.asce_limit) {
+	if (addr + len > current->mm->context.asce_limit &&
+	    addr + len <= TASK_SIZE) {
 		rc = crst_table_upgrade(mm, addr + len);
 		if (rc)
 			return (unsigned long) rc;
@@ -183,7 +184,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	}
 
 check_asce_limit:
-	if (addr + len > current->mm->context.asce_limit) {
+	if (addr + len > current->mm->context.asce_limit &&
+	    addr + len <= TASK_SIZE) {
 		rc = crst_table_upgrade(mm, addr + len);
 		if (rc)
 			return (unsigned long) rc;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index d4d409ba206b..4a1f7366b17a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -591,11 +591,11 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 	unsigned long ptev;
 	pgste_t pgste;
 
-	/* Clear storage key */
+	/* Clear storage key ACC and F, but set R/C */
 	preempt_disable();
 	pgste = pgste_get_lock(ptep);
-	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
-			      PGSTE_GR_BIT | PGSTE_GC_BIT);
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+	pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
 	ptev = pte_val(*ptep);
 	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
 		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 01c6fbc3e85b..1803797fc885 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1253,7 +1253,8 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
 		insn_count = bpf_jit_insn(jit, fp, i);
 		if (insn_count < 0)
 			return -1;
-		jit->addrs[i + 1] = jit->prg; /* Next instruction address */
+		/* Next instruction address */
+		jit->addrs[i + insn_count] = jit->prg;
 	}
 	bpf_jit_epilogue(jit);
 
diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h
index c9828f785ca0..5b5086367639 100644
--- a/arch/sh/include/asm/bug.h
+++ b/arch/sh/include/asm/bug.h
@@ -24,14 +24,14 @@
  */
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define _EMIT_BUG_ENTRY				\
-	"\t.pushsection __bug_table,\"a\"\n"	\
+	"\t.pushsection __bug_table,\"aw\"\n"	\
 	"2:\t.long 1b, %O1\n"			\
 	"\t.short %O2, %O3\n"			\
 	"\t.org 2b+%O4\n"			\
 	"\t.popsection\n"
 #else
 #define _EMIT_BUG_ENTRY				\
-	"\t.pushsection __bug_table,\"a\"\n"	\
+	"\t.pushsection __bug_table,\"aw\"\n"	\
 	"2:\t.long 1b\n"			\
 	"\t.short %O3\n"			\
 	"\t.org 2b+%O4\n"			\
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index d0078747d308..8f8cf941a8cd 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
 }
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	u32 oparg = (encoded_op << 8) >> 20;
-	u32 cmparg = (encoded_op << 20) >> 20;
 	u32 oldval, newval, prev;
 	int ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	do {
@@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 }
diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h
index c46e8cc7b515..5ed7dbbd94ff 100644
--- a/arch/sh/include/asm/spinlock-cas.h
+++ b/arch/sh/include/asm/spinlock-cas.h
@@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new
 #define arch_spin_is_locked(x)		((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	while (!__sl_cas(&lock->lock, 1, 0));
diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h
index cec78143fa83..f77263aae760 100644
--- a/arch/sh/include/asm/spinlock-llsc.h
+++ b/arch/sh/include/asm/spinlock-llsc.h
@@ -21,11 +21,6 @@
 #define arch_spin_is_locked(x)		((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 46e0d635e36f..51a8bc967e75 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -36,7 +36,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -47,9 +48,10 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
 }
 
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end, bool force)
 {
-	if (tlb->fullmm)
+	if (tlb->fullmm || force)
 		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/sh/include/uapi/asm/ioctls.h b/arch/sh/include/uapi/asm/ioctls.h
index eec7901e9e65..787bac9f67da 100644
--- a/arch/sh/include/uapi/asm/ioctls.h
+++ b/arch/sh/include/uapi/asm/ioctls.h
@@ -93,7 +93,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 #define TIOCSERCONFIG	_IO('T', 83) /* 0x5453 */
 #define TIOCSERGWILD	_IOR('T', 84,  int) /* 0x5454 */
diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig
index c74d3701ad68..207a43a2d8b3 100644
--- a/arch/sparc/configs/sparc32_defconfig
+++ b/arch/sparc/configs/sparc32_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
@@ -23,7 +22,6 @@ CONFIG_IP_PNP_DHCP=y
 CONFIG_INET_AH=y
 CONFIG_INET_ESP=y
 CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
 CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
@@ -69,7 +67,6 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT2_FS_XATTR=y
 CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
-CONFIG_AUTOFS_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_ISO9660_FS=m
 CONFIG_PROC_KCORE=y
@@ -82,7 +79,6 @@ CONFIG_NLS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 # CONFIG_SCHED_DEBUG is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
 CONFIG_KGDB=y
 CONFIG_KGDB_TESTS=y
 CONFIG_CRYPTO_NULL=m
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index b2e650d1764f..ca8609d7292f 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -1,5 +1,4 @@
 CONFIG_64BIT=y
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
@@ -184,7 +183,6 @@ CONFIG_HID_TOPSEED=y
 CONFIG_HID_THRUSTMASTER=y
 CONFIG_HID_ZEROPLUS=y
 CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
 CONFIG_USB_EHCI_HCD=m
 # CONFIG_USB_EHCI_TT_NEWSCHED is not set
 CONFIG_USB_OHCI_HCD=y
@@ -210,8 +208,6 @@ CONFIG_LOCKUP_DETECTOR=y
 CONFIG_DETECT_HUNG_TASK=y
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_UPROBE_EVENTS=y
 CONFIG_KEYS=y
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index ee3f11c43cda..7643e979e333 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int);
 int __atomic_add_unless(atomic_t *, int, int);
 void atomic_set(atomic_t *, int);
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 #define atomic_read(v)          ACCESS_ONCE((v)->counter)
 
 #define atomic_add(i, v)	((void)atomic_add_return( (int)(i), (v)))
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 4e899b0dabf7..1cfd89d92208 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -29,22 +29,14 @@
 	: "r" (uaddr), "r" (oparg), "i" (-EFAULT)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-		return -EFAULT;
 	if (unlikely((((unsigned long) uaddr) & 0x3UL)))
 		return -EINVAL;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 2cddcda4f85f..87841d687f8d 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -27,9 +27,11 @@ void destroy_context(struct mm_struct *mm);
 void __tsb_context_switch(unsigned long pgd_pa,
 			  struct tsb_config *tsb_base,
 			  struct tsb_config *tsb_huge,
-			  unsigned long tsb_descr_pa);
+			  unsigned long tsb_descr_pa,
+			  unsigned long secondary_ctx);
 
-static inline void tsb_context_switch(struct mm_struct *mm)
+static inline void tsb_context_switch_ctx(struct mm_struct *mm,
+					  unsigned long ctx)
 {
 	__tsb_context_switch(__pa(mm->pgd),
 			     &mm->context.tsb_block[MM_TSB_BASE],
@@ -40,9 +42,12 @@ static inline void tsb_context_switch(struct mm_struct *mm)
 #else
 			     NULL
 #endif
-			     , __pa(&mm->context.tsb_descr[MM_TSB_BASE]));
+			     , __pa(&mm->context.tsb_descr[MM_TSB_BASE]),
+			     ctx);
 }
 
+#define tsb_context_switch(X) tsb_context_switch_ctx(X, 0)
+
 void tsb_grow(struct mm_struct *mm,
 	      unsigned long tsb_index,
 	      unsigned long mm_rss);
@@ -112,8 +117,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	 * cpu0 to update it's TSB because at that point the cpu_vm_mask
 	 * only had cpu1 set in it.
 	 */
-	load_secondary_context(mm);
-	tsb_context_switch(mm);
+	tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context));
 
 	/* Any time a processor runs a context on an address space
 	 * for the first time, we must flush that context out of the
diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index 0efd0583a8c9..6249214148c2 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -68,6 +68,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t;
 #define iopgprot_val(x)	((x).iopgprot)
 
 #define __pte(x)	((pte_t) { (x) } )
+#define __pmd(x)	((pmd_t) { { (x) }, })
 #define __iopte(x)	((iopte_t) { (x) } )
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __ctxd(x)	((ctxd_t) { (x) } )
@@ -95,6 +96,7 @@ typedef unsigned long iopgprot_t;
 #define iopgprot_val(x)	(x)
 
 #define __pte(x)	(x)
+#define __pmd(x)	((pmd_t) { { (x) }, })
 #define __iopte(x)	(x)
 #define __pgd(x)	(x)
 #define __ctxd(x)	(x)
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 8011e79f59c9..67345b2dc408 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -14,11 +14,6 @@
 
 #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	__asm__ __volatile__(
diff --git a/arch/sparc/include/asm/spitfire.h b/arch/sparc/include/asm/spitfire.h
index 1d8321c827a8..1b1286d05069 100644
--- a/arch/sparc/include/asm/spitfire.h
+++ b/arch/sparc/include/asm/spitfire.h
@@ -47,10 +47,26 @@
 #define SUN4V_CHIP_NIAGARA5	0x05
 #define SUN4V_CHIP_SPARC_M6	0x06
 #define SUN4V_CHIP_SPARC_M7	0x07
+#define SUN4V_CHIP_SPARC_M8	0x08
 #define SUN4V_CHIP_SPARC64X	0x8a
 #define SUN4V_CHIP_SPARC_SN	0x8b
 #define SUN4V_CHIP_UNKNOWN	0xff
 
+/*
+ * The following CPU_ID_xxx constants are used
+ * to identify the CPU type in the setup phase
+ * (see head_64.S)
+ */
+#define CPU_ID_NIAGARA1		('1')
+#define CPU_ID_NIAGARA2		('2')
+#define CPU_ID_NIAGARA3		('3')
+#define CPU_ID_NIAGARA4		('4')
+#define CPU_ID_NIAGARA5		('5')
+#define CPU_ID_M6		('6')
+#define CPU_ID_M7		('7')
+#define CPU_ID_M8		('8')
+#define CPU_ID_SONOMA1		('N')
+
 #ifndef __ASSEMBLY__
 
 enum ultra_tlb_layout {
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index ec9c04de3664..ff05992dae7a 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS];
 void init_cur_cpu_trap(struct thread_info *);
 void setup_tba(void);
 extern int ncpus_probed;
+extern u64 cpu_mondo_counter[NR_CPUS];
 
 unsigned long real_hard_smp_processor_id(void);
 
diff --git a/arch/sparc/include/uapi/asm/ioctls.h b/arch/sparc/include/uapi/asm/ioctls.h
index 6d27398632ea..f5df72b93bb2 100644
--- a/arch/sparc/include/uapi/asm/ioctls.h
+++ b/arch/sparc/include/uapi/asm/ioctls.h
@@ -88,7 +88,7 @@
 #define TIOCGPTN	_IOR('t', 134, unsigned int) /* Get Pty Number */
 #define TIOCSPTLCK	_IOW('t', 135, int) /* Lock/unlock PTY */
 #define TIOCSIG		_IOW('t', 136, int) /* Generate signal on Pty slave */
-#define TIOCGPTPEER	_IOR('t', 137, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('t', 137) /* Safely open the slave */
 
 /* Little f */
 #define FIOCLEX		_IO('f', 1)
diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c
index 493e023a468a..ef4f18f7a674 100644
--- a/arch/sparc/kernel/cpu.c
+++ b/arch/sparc/kernel/cpu.c
@@ -506,6 +506,12 @@ static void __init sun4v_cpu_probe(void)
 		sparc_pmu_type = "sparc-m7";
 		break;
 
+	case SUN4V_CHIP_SPARC_M8:
+		sparc_cpu_type = "SPARC-M8";
+		sparc_fpu_type = "SPARC-M8 integrated FPU";
+		sparc_pmu_type = "sparc-m8";
+		break;
+
 	case SUN4V_CHIP_SPARC_SN:
 		sparc_cpu_type = "SPARC-SN";
 		sparc_fpu_type = "SPARC-SN integrated FPU";
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
index 45c820e1cba5..90d550bbfeef 100644
--- a/arch/sparc/kernel/cpumap.c
+++ b/arch/sparc/kernel/cpumap.c
@@ -328,6 +328,7 @@ static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
 	case SUN4V_CHIP_NIAGARA5:
 	case SUN4V_CHIP_SPARC_M6:
 	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
 	case SUN4V_CHIP_SPARC_SN:
 	case SUN4V_CHIP_SPARC64X:
 		rover_inc_table = niagara_iterate_method;
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 41a407328667..78e0211753d2 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -424,22 +424,25 @@ EXPORT_SYMBOL(sun4v_chip_type)
 	 nop
 
 70:	ldub	[%g1 + 7], %g2
-	cmp	%g2, '3'
+	cmp	%g2, CPU_ID_NIAGARA3
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA3, %g4
-	cmp	%g2, '4'
+	cmp	%g2, CPU_ID_NIAGARA4
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA4, %g4
-	cmp	%g2, '5'
+	cmp	%g2, CPU_ID_NIAGARA5
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA5, %g4
-	cmp	%g2, '6'
+	cmp	%g2, CPU_ID_M6
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_SPARC_M6, %g4
-	cmp	%g2, '7'
+	cmp	%g2, CPU_ID_M7
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_SPARC_M7, %g4
-	cmp	%g2, 'N'
+	cmp	%g2, CPU_ID_M8
+	be,pt	%xcc, 5f
+	 mov	SUN4V_CHIP_SPARC_M8, %g4
+	cmp	%g2, CPU_ID_SONOMA1
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_SPARC_SN, %g4
 	ba,pt	%xcc, 49f
@@ -448,10 +451,10 @@ EXPORT_SYMBOL(sun4v_chip_type)
 91:	sethi	%hi(prom_cpu_compatible), %g1
 	or	%g1, %lo(prom_cpu_compatible), %g1
 	ldub	[%g1 + 17], %g2
-	cmp	%g2, '1'
+	cmp	%g2, CPU_ID_NIAGARA1
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA1, %g4
-	cmp	%g2, '2'
+	cmp	%g2, CPU_ID_NIAGARA2
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA2, %g4
 	
@@ -602,6 +605,9 @@ niagara_tlb_fixup:
 	cmp	%g1, SUN4V_CHIP_SPARC_M7
 	be,pt	%xcc, niagara4_patch
 	 nop
+	cmp	%g1, SUN4V_CHIP_SPARC_M8
+	be,pt	%xcc, niagara4_patch
+	 nop
 	cmp	%g1, SUN4V_CHIP_SPARC_SN
 	be,pt	%xcc, niagara4_patch
 	 nop
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 24f21c726dfa..9ebebf1fd93d 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -673,12 +673,14 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
 static int dma_4v_supported(struct device *dev, u64 device_mask)
 {
 	struct iommu *iommu = dev->archdata.iommu;
-	u64 dma_addr_mask;
+	u64 dma_addr_mask = iommu->dma_addr_mask;
 
-	if (device_mask > DMA_BIT_MASK(32) && iommu->atu)
-		dma_addr_mask = iommu->atu->dma_addr_mask;
-	else
-		dma_addr_mask = iommu->dma_addr_mask;
+	if (device_mask > DMA_BIT_MASK(32)) {
+		if (iommu->atu)
+			dma_addr_mask = iommu->atu->dma_addr_mask;
+		else
+			return 0;
+	}
 
 	if ((device_mask & dma_addr_mask) == dma_addr_mask)
 		return 1;
@@ -1264,8 +1266,6 @@ static int pci_sun4v_probe(struct platform_device *op)
 			 * ATU group, but ATU hcalls won't be available.
 			 */
 			hv_atu = false;
-			pr_err(PFX "Could not register hvapi ATU err=%d\n",
-			       err);
 		} else {
 			pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n",
 				vatu_major, vatu_minor);
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index a38787b84322..732af9a9f6dd 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -602,7 +602,7 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	int i, has_io, has_mem;
-	unsigned int cmd;
+	unsigned int cmd = 0;
 	struct linux_pcic *pcic;
 	/* struct linux_pbm_info* pbm = &pcic->pbm; */
 	int node;
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 4d9c3e13c150..150ee7d4b059 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -288,10 +288,17 @@ static void __init sun4v_patch(void)
 
 	sun4v_patch_2insn_range(&__sun4v_2insn_patch,
 				&__sun4v_2insn_patch_end);
-	if (sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
-	    sun4v_chip_type == SUN4V_CHIP_SPARC_SN)
+
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
+	case SUN4V_CHIP_SPARC_SN:
 		sun_m7_patch_2insn_range(&__sun_m7_2insn_patch,
 					 &__sun_m7_2insn_patch_end);
+		break;
+	default:
+		break;
+	}
 
 	sun4v_hvapi_init();
 }
@@ -529,6 +536,7 @@ static void __init init_sparc64_elf_hwcap(void)
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 			cap |= HWCAP_SPARC_BLKINIT;
@@ -538,6 +546,7 @@ static void __init init_sparc64_elf_hwcap(void)
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 			cap |= HWCAP_SPARC_N2;
@@ -568,6 +577,7 @@ static void __init init_sparc64_elf_hwcap(void)
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 				cap |= (AV_SPARC_VIS | AV_SPARC_VIS2 |
@@ -578,6 +588,7 @@ static void __init init_sparc64_elf_hwcap(void)
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M8 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC_SN ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 				cap |= (AV_SPARC_VIS3 | AV_SPARC_HPC |
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index fdf31040a7dc..3218bc43302e 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -622,22 +622,48 @@ retry:
 	}
 }
 
-/* Multi-cpu list version.  */
+#define	CPU_MONDO_COUNTER(cpuid)	(cpu_mondo_counter[cpuid])
+#define	MONDO_USEC_WAIT_MIN		2
+#define	MONDO_USEC_WAIT_MAX		100
+#define	MONDO_RETRY_LIMIT		500000
+
+/* Multi-cpu list version.
+ *
+ * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
+ * Sometimes not all cpus receive the mondo, requiring us to re-send
+ * the mondo until all cpus have received, or cpus are truly stuck
+ * unable to receive mondo, and we timeout.
+ * Occasionally a target cpu strand is borrowed briefly by hypervisor to
+ * perform guest service, such as PCIe error handling. Consider the
+ * service time, 1 second overall wait is reasonable for 1 cpu.
+ * Here two in-between mondo check wait time are defined: 2 usec for
+ * single cpu quick turn around and up to 100usec for large cpu count.
+ * Deliver mondo to large number of cpus could take longer, we adjusts
+ * the retry count as long as target cpus are making forward progress.
+ */
 static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 {
-	int retries, this_cpu, prev_sent, i, saw_cpu_error;
+	int this_cpu, tot_cpus, prev_sent, i, rem;
+	int usec_wait, retries, tot_retries;
+	u16 first_cpu = 0xffff;
+	unsigned long xc_rcvd = 0;
 	unsigned long status;
+	int ecpuerror_id = 0;
+	int enocpu_id = 0;
 	u16 *cpu_list;
+	u16 cpu;
 
 	this_cpu = smp_processor_id();
-
 	cpu_list = __va(tb->cpu_list_pa);
-
-	saw_cpu_error = 0;
-	retries = 0;
+	usec_wait = cnt * MONDO_USEC_WAIT_MIN;
+	if (usec_wait > MONDO_USEC_WAIT_MAX)
+		usec_wait = MONDO_USEC_WAIT_MAX;
+	retries = tot_retries = 0;
+	tot_cpus = cnt;
 	prev_sent = 0;
+
 	do {
-		int forward_progress, n_sent;
+		int n_sent, mondo_delivered, target_cpu_busy;
 
 		status = sun4v_cpu_mondo_send(cnt,
 					      tb->cpu_list_pa,
@@ -645,94 +671,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
 
 		/* HV_EOK means all cpus received the xcall, we're done.  */
 		if (likely(status == HV_EOK))
-			break;
+			goto xcall_done;
+
+		/* If not these non-fatal errors, panic */
+		if (unlikely((status != HV_EWOULDBLOCK) &&
+			(status != HV_ECPUERROR) &&
+			(status != HV_ENOCPU)))
+			goto fatal_errors;
 
 		/* First, see if we made any forward progress.
 		 *
+		 * Go through the cpu_list, count the target cpus that have
+		 * received our mondo (n_sent), and those that did not (rem).
+		 * Re-pack cpu_list with the cpus remain to be retried in the
+		 * front - this simplifies tracking the truly stalled cpus.
+		 *
 		 * The hypervisor indicates successful sends by setting
 		 * cpu list entries to the value 0xffff.
+		 *
+		 * EWOULDBLOCK means some target cpus did not receive the
+		 * mondo and retry usually helps.
+		 *
+		 * ECPUERROR means at least one target cpu is in error state,
+		 * it's usually safe to skip the faulty cpu and retry.
+		 *
+		 * ENOCPU means one of the target cpu doesn't belong to the
+		 * domain, perhaps offlined which is unexpected, but not
+		 * fatal and it's okay to skip the offlined cpu.
 		 */
+		rem = 0;
 		n_sent = 0;
 		for (i = 0; i < cnt; i++) {
-			if (likely(cpu_list[i] == 0xffff))
+			cpu = cpu_list[i];
+			if (likely(cpu == 0xffff)) {
 				n_sent++;
+			} else if ((status == HV_ECPUERROR) &&
+				(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
+				ecpuerror_id = cpu + 1;
+			} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
+				enocpu_id = cpu + 1;
+			} else {
+				cpu_list[rem++] = cpu;
+			}
 		}
 
-		forward_progress = 0;
-		if (n_sent > prev_sent)
-			forward_progress = 1;
+		/* No cpu remained, we're done. */
+		if (rem == 0)
+			break;
 
-		prev_sent = n_sent;
+		/* Otherwise, update the cpu count for retry. */
+		cnt = rem;
 
-		/* If we get a HV_ECPUERROR, then one or more of the cpus
-		 * in the list are in error state.  Use the cpu_state()
-		 * hypervisor call to find out which cpus are in error state.
+		/* Record the overall number of mondos received by the
+		 * first of the remaining cpus.
 		 */
-		if (unlikely(status == HV_ECPUERROR)) {
-			for (i = 0; i < cnt; i++) {
-				long err;
-				u16 cpu;
+		if (first_cpu != cpu_list[0]) {
+			first_cpu = cpu_list[0];
+			xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
+		}
 
-				cpu = cpu_list[i];
-				if (cpu == 0xffff)
-					continue;
+		/* Was any mondo delivered successfully? */
+		mondo_delivered = (n_sent > prev_sent);
+		prev_sent = n_sent;
 
-				err = sun4v_cpu_state(cpu);
-				if (err == HV_CPU_STATE_ERROR) {
-					saw_cpu_error = (cpu + 1);
-					cpu_list[i] = 0xffff;
-				}
-			}
-		} else if (unlikely(status != HV_EWOULDBLOCK))
-			goto fatal_mondo_error;
+		/* or, was any target cpu busy processing other mondos? */
+		target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
+		xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
 
-		/* Don't bother rewriting the CPU list, just leave the
-		 * 0xffff and non-0xffff entries in there and the
-		 * hypervisor will do the right thing.
-		 *
-		 * Only advance timeout state if we didn't make any
-		 * forward progress.
+		/* Retry count is for no progress. If we're making progress,
+		 * reset the retry count.
 		 */
-		if (unlikely(!forward_progress)) {
-			if (unlikely(++retries > 10000))
-				goto fatal_mondo_timeout;
-
-			/* Delay a little bit to let other cpus catch up
-			 * on their cpu mondo queue work.
-			 */
-			udelay(2 * cnt);
+		if (likely(mondo_delivered || target_cpu_busy)) {
+			tot_retries += retries;
+			retries = 0;
+		} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
+			goto fatal_mondo_timeout;
 		}
-	} while (1);
 
-	if (unlikely(saw_cpu_error))
-		goto fatal_mondo_cpu_error;
+		/* Delay a little bit to let other cpus catch up on
+		 * their cpu mondo queue work.
+		 */
+		if (!mondo_delivered)
+			udelay(usec_wait);
 
-	return;
+		retries++;
+	} while (1);
 
-fatal_mondo_cpu_error:
-	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
-	       "(including %d) were in error state\n",
-	       this_cpu, saw_cpu_error - 1);
+xcall_done:
+	if (unlikely(ecpuerror_id > 0)) {
+		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
+		       this_cpu, ecpuerror_id - 1);
+	} else if (unlikely(enocpu_id > 0)) {
+		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
+		       this_cpu, enocpu_id - 1);
+	}
 	return;
 
+fatal_errors:
+	/* fatal errors include bad alignment, etc */
+	pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
+	       this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+	panic("Unexpected SUN4V mondo error %lu\n", status);
+
 fatal_mondo_timeout:
-	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
-	       " progress after %d retries.\n",
-	       this_cpu, retries);
-	goto dump_cpu_list_and_out;
-
-fatal_mondo_error:
-	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
-	       this_cpu, status);
-	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
-	       "mondo_block_pa(%lx)\n",
-	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
-
-dump_cpu_list_and_out:
-	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
-	for (i = 0; i < cnt; i++)
-		printk("%u ", cpu_list[i]);
-	printk("]\n");
+	/* some cpus being non-responsive to the cpu mondo */
+	pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
+	       this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
+	panic("SUN4V mondo timeout panic\n");
 }
 
 static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
diff --git a/arch/sparc/kernel/sun4v_ivec.S b/arch/sparc/kernel/sun4v_ivec.S
index 559bc5e9c199..34631995859a 100644
--- a/arch/sparc/kernel/sun4v_ivec.S
+++ b/arch/sparc/kernel/sun4v_ivec.S
@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
 	ldxa	[%g0] ASI_SCRATCHPAD, %g4
 	sub	%g4, TRAP_PER_CPU_FAULT_INFO, %g4
 
+	/* Get smp_processor_id() into %g3 */
+	sethi	%hi(trap_block), %g5
+	or	%g5, %lo(trap_block), %g5
+	sub	%g4, %g5, %g3
+	srlx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3
+
+	/* Increment cpu_mondo_counter[smp_processor_id()] */
+	sethi	%hi(cpu_mondo_counter), %g5
+	or	%g5, %lo(cpu_mondo_counter), %g5
+	sllx	%g3, 3, %g3
+	add	%g5, %g3, %g5
+	ldx	[%g5], %g3
+	add	%g3, 1, %g3
+	stx	%g3, [%g5]
+
 	/* Get CPU mondo queue base phys address into %g7.  */
 	ldx	[%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
 
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 196ee5eb4d48..ad31af1dd726 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2733,6 +2733,7 @@ void do_getpsr(struct pt_regs *regs)
 	}
 }
 
+u64 cpu_mondo_counter[NR_CPUS] = {0};
 struct trap_per_cpu trap_block[NR_CPUS];
 EXPORT_SYMBOL(trap_block);
 
diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S
index 07c0df924960..db872dbfafe9 100644
--- a/arch/sparc/kernel/tsb.S
+++ b/arch/sparc/kernel/tsb.S
@@ -360,6 +360,7 @@ tsb_flush:
 	 * %o1:	TSB base config pointer
 	 * %o2:	TSB huge config pointer, or NULL if none
 	 * %o3:	Hypervisor TSB descriptor physical address
+	 * %o4: Secondary context to load, if non-zero
 	 *
 	 * We have to run this whole thing with interrupts
 	 * disabled so that the current cpu doesn't change
@@ -372,6 +373,17 @@ __tsb_context_switch:
 	rdpr	%pstate, %g1
 	wrpr	%g1, PSTATE_IE, %pstate
 
+	brz,pn	%o4, 1f
+	 mov	SECONDARY_CONTEXT, %o5
+
+661:	stxa	%o4, [%o5] ASI_DMMU
+	.section .sun4v_1insn_patch, "ax"
+	.word	661b
+	stxa	%o4, [%o5] ASI_MMU
+	.previous
+	flush	%g6
+
+1:
 	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
 
 	stx	%o0, [%g2 + TRAP_PER_CPU_PGD_PADDR]
diff --git a/arch/sparc/lib/U3memcpy.S b/arch/sparc/lib/U3memcpy.S
index 54f98706b03b..5a8cb37f0a3b 100644
--- a/arch/sparc/lib/U3memcpy.S
+++ b/arch/sparc/lib/U3memcpy.S
@@ -145,13 +145,13 @@ ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
 ENTRY(U3_retl_o2_and_7_plus_GS)
 	and	%o2, 7, %o2
 	retl
-	 add	%o2, GLOBAL_SPARE, %o2
+	 add	%o2, GLOBAL_SPARE, %o0
 ENDPROC(U3_retl_o2_and_7_plus_GS)
 ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
 	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
 	and	%o2, 7, %o2
 	retl
-	 add	%o2, GLOBAL_SPARE, %o2
+	 add	%o2, GLOBAL_SPARE, %o0
 ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
 #endif
 
diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S
index d6b6c97fe3c7..703127aaf4a5 100644
--- a/arch/sparc/lib/multi3.S
+++ b/arch/sparc/lib/multi3.S
@@ -5,26 +5,26 @@
 	.align	4
 ENTRY(__multi3) /* %o0 = u, %o1 = v */
 	mov	%o1, %g1
-	srl	%o3, 0, %g4
-	mulx	%g4, %g1, %o1
+	srl	%o3, 0, %o4
+	mulx	%o4, %g1, %o1
 	srlx	%g1, 0x20, %g3
-	mulx	%g3, %g4, %g5
-	sllx	%g5, 0x20, %o5
-	srl	%g1, 0, %g4
+	mulx	%g3, %o4, %g7
+	sllx	%g7, 0x20, %o5
+	srl	%g1, 0, %o4
 	sub	%o1, %o5, %o5
 	srlx	%o5, 0x20, %o5
-	addcc	%g5, %o5, %g5
+	addcc	%g7, %o5, %g7
 	srlx	%o3, 0x20, %o5
-	mulx	%g4, %o5, %g4
+	mulx	%o4, %o5, %o4
 	mulx	%g3, %o5, %o5
 	sethi	%hi(0x80000000), %g3
-	addcc	%g5, %g4, %g5
-	srlx	%g5, 0x20, %g5
+	addcc	%g7, %o4, %g7
+	srlx	%g7, 0x20, %g7
 	add	%g3, %g3, %g3
 	movcc	%xcc, %g0, %g3
-	addcc	%o5, %g5, %o5
-	sllx	%g4, 0x20, %g4
-	add	%o1, %g4, %o1
+	addcc	%o5, %g7, %o5
+	sllx	%o4, 0x20, %o4
+	add	%o1, %o4, %o1
 	add	%o5, %g3, %g2
 	mulx	%g1, %o2, %g1
 	add	%g1, %g2, %g1
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3c40ebd50f92..afa0099f3748 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -325,6 +325,29 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
+static void __init add_huge_page_size(unsigned long size)
+{
+	unsigned int order;
+
+	if (size_to_hstate(size))
+		return;
+
+	order = ilog2(size) - PAGE_SHIFT;
+	hugetlb_add_hstate(order);
+}
+
+static int __init hugetlbpage_init(void)
+{
+	add_huge_page_size(1UL << HPAGE_64K_SHIFT);
+	add_huge_page_size(1UL << HPAGE_SHIFT);
+	add_huge_page_size(1UL << HPAGE_256MB_SHIFT);
+	add_huge_page_size(1UL << HPAGE_2GB_SHIFT);
+
+	return 0;
+}
+
+arch_initcall(hugetlbpage_init);
+
 static int __init setup_hugepagesz(char *string)
 {
 	unsigned long long hugepage_size;
@@ -364,7 +387,7 @@ static int __init setup_hugepagesz(char *string)
 		goto out;
 	}
 
-	hugetlb_add_hstate(hugepage_shift - PAGE_SHIFT);
+	add_huge_page_size(hugepage_size);
 	rc = 1;
 
 out:
@@ -1921,12 +1944,22 @@ static void __init setup_page_offset(void)
 			break;
 		case SUN4V_CHIP_SPARC_M7:
 		case SUN4V_CHIP_SPARC_SN:
-		default:
 			/* M7 and later support 52-bit virtual addresses.  */
 			sparc64_va_hole_top =    0xfff8000000000000UL;
 			sparc64_va_hole_bottom = 0x0008000000000000UL;
 			max_phys_bits = 49;
 			break;
+		case SUN4V_CHIP_SPARC_M8:
+		default:
+			/* M8 and later support 54-bit virtual addresses.
+			 * However, restricting M8 and above VA bits to 53
+			 * as 4-level page table cannot support more than
+			 * 53 VA bits.
+			 */
+			sparc64_va_hole_top =    0xfff0000000000000UL;
+			sparc64_va_hole_bottom = 0x0010000000000000UL;
+			max_phys_bits = 51;
+			break;
 		}
 	}
 
@@ -2138,6 +2171,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 	 */
 	switch (sun4v_chip_type) {
 	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
 	case SUN4V_CHIP_SPARC_SN:
 		pagecv_flag = 0x00;
 		break;
@@ -2290,6 +2324,7 @@ void __init paging_init(void)
 	 */
 	switch (sun4v_chip_type) {
 	case SUN4V_CHIP_SPARC_M7:
+	case SUN4V_CHIP_SPARC_M8:
 	case SUN4V_CHIP_SPARC_SN:
 		page_cache4v_flag = _PAGE_CP_4V;
 		break;
diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c
index 17bd2e167e07..df707a8ad311 100644
--- a/arch/sparc/power/hibernate.c
+++ b/arch/sparc/power/hibernate.c
@@ -35,6 +35,5 @@ void restore_processor_state(void)
 {
 	struct mm_struct *mm = current->active_mm;
 
-	load_secondary_context(mm);
-	tsb_context_switch(mm);
+	tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context));
 }
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index a93774255136..53a423e7cb92 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n)
 	_atomic_xchg(&v->counter, n);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 /* A 64bit atomic type */
 
 typedef struct {
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index e64a1b75fc38..83c1e639b411 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -106,12 +106,9 @@
 	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int uninitialized_var(val), ret;
 
 	__futex_prolog();
@@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	/* The 32-bit futex code makes this assumption, so validate it here. */
 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (val == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (val != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (val < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (val >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (val <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (val > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = val;
+
 	return ret;
 }
 
diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h
index b14b1ba5bf9c..cba8ba9b8da6 100644
--- a/arch/tile/include/asm/spinlock_32.h
+++ b/arch/tile/include/asm/spinlock_32.h
@@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	lock->current_ticket = old_ticket + TICKET_QUANTUM;
 }
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index b9718fb4e74a..9a2c2d605752 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT);
 }
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
 void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val);
 
 /* Grab the "next" ticket number and bump it atomically.
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 076c6cc43113..db9333f2447c 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
 }
 EXPORT_SYMBOL(arch_spin_trylock);
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u32 iterations = 0;
-	int curr = READ_ONCE(lock->current_ticket);
-	int next = READ_ONCE(lock->next_ticket);
-
-	/* Return immediately if unlocked. */
-	if (next == curr)
-		return;
-
-	/* Wait until the current locker has released the lock. */
-	do {
-		delay_backoff(iterations++);
-	} while (READ_ONCE(lock->current_ticket) == curr);
-
-	/*
-	 * The TILE architecture doesn't do read speculation; therefore
-	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
-	 */
-	barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
-
 /*
  * The low byte is always reserved to be the marker for a "tns" operation
  * since the low bit is set to "1" by a tns.  The next seven bits are
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
index a4b5b2cbce93..de414c22892f 100644
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
 }
 EXPORT_SYMBOL(arch_spin_trylock);
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u32 iterations = 0;
-	u32 val = READ_ONCE(lock->lock);
-	u32 curr = arch_spin_current(val);
-
-	/* Return immediately if unlocked. */
-	if (arch_spin_next(val) == curr)
-		return;
-
-	/* Wait until the current locker has released the lock. */
-	do {
-		delay_backoff(iterations++);
-	} while (arch_spin_current(READ_ONCE(lock->lock)) == curr);
-
-	/*
-	 * The TILE architecture doesn't do read speculation; therefore
-	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
-	 */
-	barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
 
 /*
  * If the read lock fails due to a writer, we retry periodically
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 600a2e9bfee2..344d95619d03 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -45,7 +45,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb)
 }
 
 static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 	tlb->start = start;
@@ -80,13 +81,19 @@ tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_flush_mmu_free(tlb);
 }
 
-/* tlb_finish_mmu
+/* arch_tlb_finish_mmu
  *	Called at the end of the shootdown operation to free up any resources
  *	that were required.
  */
 static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+arch_tlb_finish_mmu(struct mmu_gather *tlb,
+		unsigned long start, unsigned long end, bool force)
 {
+	if (force) {
+		tlb->start = start;
+		tlb->end = end;
+		tlb->need_flush = 1;
+	}
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h
new file mode 100644
index 000000000000..7ffa5437b761
--- /dev/null
+++ b/arch/um/include/asm/unwind.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_UML_UNWIND_H
+#define _ASM_UML_UNWIND_H
+
+static inline void
+unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+		   void *orc, size_t orc_size) {}
+
+#endif /* _ASM_UML_UNWIND_H */
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 586b786b3edf..f65a804b86f0 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
 # Hyper-V paravirtualization support
 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
 
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
 obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..acb366bf6bc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,8 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_PMEM_API		if X86_64
+	# Causing hangs/crashes, see the commit that added this change for details.
+	select ARCH_HAS_REFCOUNT		if BROKEN
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
@@ -73,7 +75,6 @@ config X86
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select BUILDTIME_EXTABLE_SORT
@@ -100,6 +101,7 @@ config X86
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
 	select GENERIC_TIME_VSYSCALL
+	select HARDLOCKUP_CHECK_TIMESTAMP	if X86_64
 	select HAVE_ACPI_APEI			if ACPI
 	select HAVE_ACPI_APEI_NMI		if ACPI
 	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
@@ -157,17 +159,19 @@ config X86
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
+	select HAVE_MOD_ARCH_SPECIFIC
 	select HAVE_NMI
 	select HAVE_OPROFILE
 	select HAVE_OPTPROBES
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER && STACK_VALIDATION
+	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -326,6 +330,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
 	int
+	default 5 if X86_5LEVEL
 	default 4 if X86_64
 	default 3 if X86_PAE
 	default 2
@@ -424,16 +429,16 @@ config GOLDFISH
        def_bool y
        depends on X86_GOLDFISH
 
-config INTEL_RDT_A
-	bool "Intel Resource Director Technology Allocation support"
+config INTEL_RDT
+	bool "Intel Resource Director Technology support"
 	default n
 	depends on X86 && CPU_SUP_INTEL
 	select KERNFS
 	help
-	  Select to enable resource allocation which is a sub-feature of
-	  Intel Resource Director Technology(RDT). More information about
-	  RDT can be found in the Intel x86 Architecture Software
-	  Developer Manual.
+	  Select to enable resource allocation and monitoring which are
+	  sub-features of Intel Resource Director Technology(RDT). More
+	  information about RDT can be found in the Intel x86
+	  Architecture Software Developer Manual.
 
 	  Say N if unsure.
 
@@ -777,8 +782,6 @@ config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
-source "arch/x86/lguest/Kconfig"
-
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	depends on PARAVIRT
@@ -1398,6 +1401,24 @@ config X86_PAE
 	  has the cost of more pagetable lookup overhead, and also
 	  consumes more pagetable space per process.
 
+config X86_5LEVEL
+	bool "Enable 5-level page tables support"
+	depends on X86_64
+	---help---
+	  5-level paging enables access to larger address space:
+	  upto 128 PiB of virtual address space and 4 PiB of
+	  physical address space.
+
+	  It will be supported by future Intel CPUs.
+
+	  Note: a kernel with this option enabled can only be booted
+	  on machines that support the feature.
+
+	  See Documentation/x86/x86_64/5level-paging.txt for more
+	  information.
+
+	  Say N if unsure.
+
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || X86_PAE
@@ -1415,6 +1436,35 @@ config X86_DIRECT_GBPAGES
 	  supports them), so don't confuse the user by printing
 	  that we have them enabled.
 
+config ARCH_HAS_MEM_ENCRYPT
+	def_bool y
+
+config AMD_MEM_ENCRYPT
+	bool "AMD Secure Memory Encryption (SME) support"
+	depends on X86_64 && CPU_SUP_AMD
+	---help---
+	  Say yes to enable support for the encryption of system memory.
+	  This requires an AMD processor that supports Secure Memory
+	  Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+	bool "Activate AMD Secure Memory Encryption (SME) by default"
+	default y
+	depends on AMD_MEM_ENCRYPT
+	---help---
+	  Say yes to have system memory encrypted by default if running on
+	  an AMD processor that supports Secure Memory Encryption (SME).
+
+	  If set to Y, then the encryption of system memory can be
+	  deactivated with the mem_encrypt=off command line option.
+
+	  If set to N, then the encryption of system memory can be
+	  activated with the mem_encrypt=on command line option.
+
+config ARCH_USE_MEMREMAP_PROT
+	def_bool y
+	depends on AMD_MEM_ENCRYPT
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fcb7604172ce..71a48a30fc84 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
 	  Some of these sanity checks may slow down kernel entries and
 	  exits or otherwise impact performance.
 
-	  This is currently used to help test NMI code.
-
 	  If unsure, say N.
 
 config DEBUG_NMI_SELFTEST
@@ -348,6 +346,7 @@ config X86_DEBUG_FPU
 
 config PUNIT_ATOM_DEBUG
 	tristate "ATOM Punit debug driver"
+	depends on PCI
 	select DEBUG_FS
 	select IOSF_MBI
 	---help---
@@ -357,4 +356,61 @@ config PUNIT_ATOM_DEBUG
 	  The current power state can be read from
 	  /sys/kernel/debug/punit_atom/dev_power_state
 
+choice
+	prompt "Choose kernel unwinder"
+	default FRAME_POINTER_UNWINDER
+	---help---
+	  This determines which method will be used for unwinding kernel stack
+	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+	  livepatch, lockdep, and more.
+
+config FRAME_POINTER_UNWINDER
+	bool "Frame pointer unwinder"
+	select FRAME_POINTER
+	---help---
+	  This option enables the frame pointer unwinder for unwinding kernel
+	  stack traces.
+
+	  The unwinder itself is fast and it uses less RAM than the ORC
+	  unwinder, but the kernel text size will grow by ~3% and the kernel's
+	  overall performance will degrade by roughly 5-10%.
+
+	  This option is recommended if you want to use the livepatch
+	  consistency model, as this is currently the only way to get a
+	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config ORC_UNWINDER
+	bool "ORC unwinder"
+	depends on X86_64
+	select STACK_VALIDATION
+	---help---
+	  This option enables the ORC (Oops Rewind Capability) unwinder for
+	  unwinding kernel stack traces.  It uses a custom data format which is
+	  a simplified version of the DWARF Call Frame Information standard.
+
+	  This unwinder is more accurate across interrupt entry frames than the
+	  frame pointer unwinder.  It also enables a 5-10% performance
+	  improvement across the entire kernel compared to frame pointers.
+
+	  Enabling this option will increase the kernel's runtime memory usage
+	  by roughly 2-4MB, depending on your kernel config.
+
+config GUESS_UNWINDER
+	bool "Guess unwinder"
+	depends on EXPERT
+	---help---
+	  This option enables the "guess" unwinder for unwinding kernel stack
+	  traces.  It scans the stack and reports every kernel text address it
+	  finds.  Some of the addresses it reports may be incorrect.
+
+	  While this option often produces false positives, it can still be
+	  useful in many cases.  Unlike the other unwinders, it has no runtime
+	  overhead.
+
+endchoice
+
+config FRAME_POINTER
+	depends on !ORC_UNWINDER && !GUESS_UNWINDER
+	bool
+
 endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1e902f926be3..6276572259c8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif
 # For gcc stack alignment is specified with -mpreferred-stack-boundary,
 # clang has the option -mstack-alignment for that purpose.
 ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
-        cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
-        cc_stack_align_opt := -mstack-alignment
+      cc_stack_align4 := -mpreferred-stack-boundary=2
+      cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+      cc_stack_align4 := -mstack-alignment=4
+      cc_stack_align8 := -mstack-alignment=8
 endif
 
 # How to compile the 16-bit code.  Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \
 
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
 export REALMODE_CFLAGS
 
 # BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
         # Align the stack to the register width instead of using the default
         # alignment of 16 bytes. This reduces stack usage and the number of
         # alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
 
         # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
         # a lot more stack due to the lack of sharing of stacklots:
@@ -115,7 +117,7 @@ else
         # default alignment which keep the stack *mis*aligned.
         # Furthermore an alignment to the register width reduces stack usage
         # and the number of alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
 
 	# Use -mskip-rax-setup if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
@@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
-
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0d810fb15eac..d88a2fddba8c 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -73,12 +73,13 @@ UBSAN_SANITIZE := n
 $(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
+silent_redirect_image = >/dev/null
 cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
-			       $(obj)/zoffset.h $@
+			       $(obj)/zoffset.h $@ $($(quiet)redirect_image)
 
 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
 	$(call if_changed,image)
-	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+	@$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')'
 
 OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
 $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 2c860ad4fe06..8a958274b54c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -34,6 +34,7 @@ KBUILD_CFLAGS += $(cflags-y)
 KBUILD_CFLAGS += -mno-mmx -mno-sse
 KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
 KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
 
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c3e869eaef0c..926c2cc4facc 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params,
 		m |= (u64)efi->efi_memmap_hi << 32;
 #endif
 
-		d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
+		d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
 		switch (d->type) {
 		case EFI_RESERVED_TYPE:
 		case EFI_RUNTIME_SERVICES_CODE:
@@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c,
 		desc->s = DESC_TYPE_CODE_DATA;
 		desc->dpl = 0;
 		desc->p = 1;
-		desc->limit = 0xf;
+		desc->limit1 = 0xf;
 		desc->avl = 0;
 		desc->l = 0;
 		desc->d = SEG_OP_SIZE_32BIT;
@@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c,
 	desc->s = DESC_TYPE_CODE_DATA;
 	desc->dpl = 0;
 	desc->p = 1;
-	desc->limit = 0xf;
+	desc->limit1 = 0xf;
 	desc->avl = 0;
 	if (IS_ENABLED(CONFIG_X86_64)) {
 		desc->l = 1;
@@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c,
 	desc->s = DESC_TYPE_CODE_DATA;
 	desc->dpl = 0;
 	desc->p = 1;
-	desc->limit = 0xf;
+	desc->limit1 = 0xf;
 	desc->avl = 0;
 	desc->l = 0;
 	desc->d = SEG_OP_SIZE_32BIT;
@@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c,
 		desc->s = 0;
 		desc->dpl = 0;
 		desc->p = 1;
-		desc->limit = 0x0;
+		desc->limit1 = 0x0;
 		desc->avl = 0;
 		desc->l = 0;
 		desc->d = 0;
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d85b9625e836..11c68cf53d4e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -61,71 +61,6 @@
 
 	__HEAD
 ENTRY(startup_32)
-#ifdef CONFIG_EFI_STUB
-	jmp	preferred_addr
-
-	/*
-	 * We don't need the return address, so set up the stack so
-	 * efi_main() can find its arguments.
-	 */
-ENTRY(efi_pe_entry)
-	add	$0x4, %esp
-
-	call	1f
-1:	popl	%esi
-	subl	$1b, %esi
-
-	popl	%ecx
-	movl	%ecx, efi32_config(%esi)	/* Handle */
-	popl	%ecx
-	movl	%ecx, efi32_config+8(%esi)	/* EFI System table pointer */
-
-	/* Relocate efi_config->call() */
-	leal	efi32_config(%esi), %eax
-	add	%esi, 40(%eax)
-	pushl	%eax
-
-	call	make_boot_params
-	cmpl	$0, %eax
-	je	fail
-	movl	%esi, BP_code32_start(%eax)
-	popl	%ecx
-	pushl	%eax
-	pushl	%ecx
-	jmp	2f		/* Skip efi_config initialization */
-
-ENTRY(efi32_stub_entry)
-	add	$0x4, %esp
-	popl	%ecx
-	popl	%edx
-
-	call	1f
-1:	popl	%esi
-	subl	$1b, %esi
-
-	movl	%ecx, efi32_config(%esi)	/* Handle */
-	movl	%edx, efi32_config+8(%esi)	/* EFI System table pointer */
-
-	/* Relocate efi_config->call() */
-	leal	efi32_config(%esi), %eax
-	add	%esi, 40(%eax)
-	pushl	%eax
-2:
-	call	efi_main
-	cmpl	$0, %eax
-	movl	%eax, %esi
-	jne	2f
-fail:
-	/* EFI init failed, so hang. */
-	hlt
-	jmp	fail
-2:
-	movl	BP_code32_start(%esi), %eax
-	leal	preferred_addr(%eax), %eax
-	jmp	*%eax
-
-preferred_addr:
-#endif
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -208,6 +143,70 @@ preferred_addr:
 	jmp	*%eax
 ENDPROC(startup_32)
 
+#ifdef CONFIG_EFI_STUB
+/*
+ * We don't need the return address, so set up the stack so efi_main() can find
+ * its arguments.
+ */
+ENTRY(efi_pe_entry)
+	add	$0x4, %esp
+
+	call	1f
+1:	popl	%esi
+	subl	$1b, %esi
+
+	popl	%ecx
+	movl	%ecx, efi32_config(%esi)	/* Handle */
+	popl	%ecx
+	movl	%ecx, efi32_config+8(%esi)	/* EFI System table pointer */
+
+	/* Relocate efi_config->call() */
+	leal	efi32_config(%esi), %eax
+	add	%esi, 40(%eax)
+	pushl	%eax
+
+	call	make_boot_params
+	cmpl	$0, %eax
+	je	fail
+	movl	%esi, BP_code32_start(%eax)
+	popl	%ecx
+	pushl	%eax
+	pushl	%ecx
+	jmp	2f		/* Skip efi_config initialization */
+ENDPROC(efi_pe_entry)
+
+ENTRY(efi32_stub_entry)
+	add	$0x4, %esp
+	popl	%ecx
+	popl	%edx
+
+	call	1f
+1:	popl	%esi
+	subl	$1b, %esi
+
+	movl	%ecx, efi32_config(%esi)	/* Handle */
+	movl	%edx, efi32_config+8(%esi)	/* EFI System table pointer */
+
+	/* Relocate efi_config->call() */
+	leal	efi32_config(%esi), %eax
+	add	%esi, 40(%eax)
+	pushl	%eax
+2:
+	call	efi_main
+	cmpl	$0, %eax
+	movl	%eax, %esi
+	jne	2f
+fail:
+	/* EFI init failed, so hang. */
+	hlt
+	jmp	fail
+2:
+	movl	BP_code32_start(%esi), %eax
+	leal	startup_32(%eax), %eax
+	jmp	*%eax
+ENDPROC(efi32_stub_entry)
+#endif
+
 	.text
 relocated:
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fbf4c32d0b62..b4a5d284391c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -243,65 +243,6 @@ ENTRY(startup_64)
 	 * that maps our entire kernel(text+data+bss+brk), zero page
 	 * and command line.
 	 */
-#ifdef CONFIG_EFI_STUB
-	/*
-	 * The entry point for the PE/COFF executable is efi_pe_entry, so
-	 * only legacy boot loaders will execute this jmp.
-	 */
-	jmp	preferred_addr
-
-ENTRY(efi_pe_entry)
-	movq	%rcx, efi64_config(%rip)	/* Handle */
-	movq	%rdx, efi64_config+8(%rip) /* EFI System table pointer */
-
-	leaq	efi64_config(%rip), %rax
-	movq	%rax, efi_config(%rip)
-
-	call	1f
-1:	popq	%rbp
-	subq	$1b, %rbp
-
-	/*
-	 * Relocate efi_config->call().
-	 */
-	addq	%rbp, efi64_config+40(%rip)
-
-	movq	%rax, %rdi
-	call	make_boot_params
-	cmpq	$0,%rax
-	je	fail
-	mov	%rax, %rsi
-	leaq	startup_32(%rip), %rax
-	movl	%eax, BP_code32_start(%rsi)
-	jmp	2f		/* Skip the relocation */
-
-handover_entry:
-	call	1f
-1:	popq	%rbp
-	subq	$1b, %rbp
-
-	/*
-	 * Relocate efi_config->call().
-	 */
-	movq	efi_config(%rip), %rax
-	addq	%rbp, 40(%rax)
-2:
-	movq	efi_config(%rip), %rdi
-	call	efi_main
-	movq	%rax,%rsi
-	cmpq	$0,%rax
-	jne	2f
-fail:
-	/* EFI init failed, so hang. */
-	hlt
-	jmp	fail
-2:
-	movl	BP_code32_start(%esi), %eax
-	leaq	preferred_addr(%rax), %rax
-	jmp	*%rax
-
-preferred_addr:
-#endif
 
 	/* Setup data segments. */
 	xorl	%eax, %eax
@@ -413,6 +354,59 @@ lvl5:
 	jmp	*%rax
 
 #ifdef CONFIG_EFI_STUB
+
+/* The entry point for the PE/COFF executable is efi_pe_entry. */
+ENTRY(efi_pe_entry)
+	movq	%rcx, efi64_config(%rip)	/* Handle */
+	movq	%rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+	leaq	efi64_config(%rip), %rax
+	movq	%rax, efi_config(%rip)
+
+	call	1f
+1:	popq	%rbp
+	subq	$1b, %rbp
+
+	/*
+	 * Relocate efi_config->call().
+	 */
+	addq	%rbp, efi64_config+40(%rip)
+
+	movq	%rax, %rdi
+	call	make_boot_params
+	cmpq	$0,%rax
+	je	fail
+	mov	%rax, %rsi
+	leaq	startup_32(%rip), %rax
+	movl	%eax, BP_code32_start(%rsi)
+	jmp	2f		/* Skip the relocation */
+
+handover_entry:
+	call	1f
+1:	popq	%rbp
+	subq	$1b, %rbp
+
+	/*
+	 * Relocate efi_config->call().
+	 */
+	movq	efi_config(%rip), %rax
+	addq	%rbp, 40(%rax)
+2:
+	movq	efi_config(%rip), %rdi
+	call	efi_main
+	movq	%rax,%rsi
+	cmpq	$0,%rax
+	jne	2f
+fail:
+	/* EFI init failed, so hang. */
+	hlt
+	jmp	fail
+2:
+	movl	BP_code32_start(%esi), %eax
+	leaq	startup_64(%rax), %rax
+	jmp	*%rax
+ENDPROC(efi_pe_entry)
+
 	.org 0x390
 ENTRY(efi64_stub_entry)
 	movq	%rdi, efi64_config(%rip)	/* Handle */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 91f27ab970ef..17818ba6906f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -37,7 +37,9 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/ctype.h>
+#include <linux/efi.h>
 #include <generated/utsrelease.h>
+#include <asm/efi.h>
 
 /* Macros used by the included decompressor code below. */
 #define STATIC
@@ -479,35 +481,31 @@ static unsigned long slots_fetch_random(void)
 	return 0;
 }
 
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_mem_region(struct mem_vector *entry,
 			       unsigned long minimum,
 			       unsigned long image_size)
 {
 	struct mem_vector region, overlap;
 	struct slot_area slot_area;
 	unsigned long start_orig, end;
-	struct boot_e820_entry cur_entry;
-
-	/* Skip non-RAM entries. */
-	if (entry->type != E820_TYPE_RAM)
-		return;
+	struct mem_vector cur_entry;
 
 	/* On 32-bit, ignore entries entirely above our maximum. */
-	if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
 		return;
 
 	/* Ignore entries entirely below our minimum. */
-	if (entry->addr + entry->size < minimum)
+	if (entry->start + entry->size < minimum)
 		return;
 
 	/* Ignore entries above memory limit */
-	end = min(entry->size + entry->addr, mem_limit);
-	if (entry->addr >= end)
+	end = min(entry->size + entry->start, mem_limit);
+	if (entry->start >= end)
 		return;
-	cur_entry.addr = entry->addr;
-	cur_entry.size = end - entry->addr;
+	cur_entry.start = entry->start;
+	cur_entry.size = end - entry->start;
 
-	region.start = cur_entry.addr;
+	region.start = cur_entry.start;
 	region.size = cur_entry.size;
 
 	/* Give up if slot area array is full. */
@@ -521,8 +519,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 		/* Potentially raise address to meet alignment needs. */
 		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
-		/* Did we raise the address above this e820 region? */
-		if (region.start > cur_entry.addr + cur_entry.size)
+		/* Did we raise the address above the passed in memory entry? */
+		if (region.start > cur_entry.start + cur_entry.size)
 			return;
 
 		/* Reduce size by any delta from the original address. */
@@ -562,31 +560,126 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 	}
 }
 
-static unsigned long find_random_phys_addr(unsigned long minimum,
-					   unsigned long image_size)
+#ifdef CONFIG_EFI
+/*
+ * Returns true if mirror region found (and must have been processed
+ * for slots adding)
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
 {
+	struct efi_info *e = &boot_params->efi_info;
+	bool efi_mirror_found = false;
+	struct mem_vector region;
+	efi_memory_desc_t *md;
+	unsigned long pmap;
+	char *signature;
+	u32 nr_desc;
 	int i;
-	unsigned long addr;
 
-	/* Check if we had too many memmaps. */
-	if (memmap_too_large) {
-		debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
-		return 0;
+	signature = (char *)&e->efi_loader_signature;
+	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+		return false;
+
+#ifdef CONFIG_X86_32
+	/* Can't handle data above 4GB at this time */
+	if (e->efi_memmap_hi) {
+		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+		return false;
 	}
+	pmap =  e->efi_memmap;
+#else
+	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
 
-	/* Make sure minimum is aligned. */
-	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+	for (i = 0; i < nr_desc; i++) {
+		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+			efi_mirror_found = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < nr_desc; i++) {
+		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+		/*
+		 * Here we are more conservative in picking free memory than
+		 * the EFI spec allows:
+		 *
+		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
+		 * free memory and thus available to place the kernel image into,
+		 * but in practice there's firmware where using that memory leads
+		 * to crashes.
+		 *
+		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
+		 */
+		if (md->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (efi_mirror_found &&
+		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+			continue;
+
+		region.start = md->phys_addr;
+		region.size = md->num_pages << EFI_PAGE_SHIFT;
+		process_mem_region(&region, minimum, image_size);
+		if (slot_area_index == MAX_SLOT_AREA) {
+			debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+			break;
+		}
+	}
+	return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+	return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+				 unsigned long image_size)
+{
+	int i;
+	struct mem_vector region;
+	struct boot_e820_entry *entry;
 
 	/* Verify potential e820 positions, appending to slots list. */
 	for (i = 0; i < boot_params->e820_entries; i++) {
-		process_e820_entry(&boot_params->e820_table[i], minimum,
-				   image_size);
+		entry = &boot_params->e820_table[i];
+		/* Skip non-RAM entries. */
+		if (entry->type != E820_TYPE_RAM)
+			continue;
+		region.start = entry->addr;
+		region.size = entry->size;
+		process_mem_region(&region, minimum, image_size);
 		if (slot_area_index == MAX_SLOT_AREA) {
 			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
 			break;
 		}
 	}
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+					   unsigned long image_size)
+{
+	/* Check if we had too many memmaps. */
+	if (memmap_too_large) {
+		debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+		return 0;
+	}
+
+	/* Make sure minimum is aligned. */
+	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+	if (process_efi_entries(minimum, image_size))
+		return slots_fetch_random();
 
+	process_e820_entries(minimum, image_size);
 	return slots_fetch_random();
 }
 
@@ -645,7 +738,7 @@ void choose_random_location(unsigned long input,
 	 */
 	min_addr = min(*output, 512UL << 20);
 
-	/* Walk e820 and find a random address. */
+	/* Walk available memory entries to find a random address. */
 	random_addr = find_random_phys_addr(min_addr, output_size);
 	if (!random_addr) {
 		warn("Physical KASLR disabled: no suitable memory region!");
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a0838ab929f2..c14217cd0155 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,8 +116,7 @@ void __putstr(const char *s)
 		}
 	}
 
-	if (boot_params->screen_info.orig_video_mode == 0 &&
-	    lines == 0 && cols == 0)
+	if (lines == 0 || cols == 0)
 		return;
 
 	x = boot_params->screen_info.orig_x;
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 28029be47fbb..f1aa43854bed 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -15,6 +15,13 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2ed8f0c25def..1bb08ecffd24 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -520,8 +520,14 @@ pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
 # the description in lib/decompressor_xxx.c for specific information.
 #
 # extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
 
-#define ZO_z_extra_bytes	((ZO_z_output_len >> 12) + 65536 + 128)
+#define ZO_z_extra_bytes	((ZO_z_output_len >> 8) + 65536)
 #if ZO_z_output_len > ZO_z_input_len
 # define ZO_z_extract_offset	(ZO_z_output_len + ZO_z_extra_bytes - \
 				 ZO_z_input_len)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 630e3664906b..16f49123d747 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -16,6 +16,15 @@
 #include "ctype.h"
 #include "string.h"
 
+/*
+ * Undef these macros so that the functions that we provide
+ * here will have the correct names regardless of how string.h
+ * may have chosen to #define them.
+ */
+#undef memcpy
+#undef memset
+#undef memcmp
+
 int memcmp(const void *s1, const void *s2, size_t len)
 {
 	bool diff;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6cf79e1a6830..0eb9f92f3717 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,5 +1,4 @@
 # CONFIG_64BIT is not set
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
@@ -125,7 +124,6 @@ CONFIG_NF_CONNTRACK_IPV4=y
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_ULOG=y
 CONFIG_NF_NAT=y
 CONFIG_IP_NF_TARGET_MASQUERADE=y
 CONFIG_IP_NF_MANGLE=y
@@ -255,7 +253,6 @@ CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
 CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
 CONFIG_EDAC=y
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4b429df40d7a..550cd5012b73 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,3 +1,5 @@
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
+CONFIG_GUESS_UNWINDER=y
+# CONFIG_FRAME_POINTER_UNWINDER is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index de45f57b410d..4a4b16e56d35 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,4 +1,3 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
@@ -124,7 +123,6 @@ CONFIG_NF_CONNTRACK_IPV4=y
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_ULOG=y
 CONFIG_NF_NAT=y
 CONFIG_IP_NF_TARGET_MASQUERADE=y
 CONFIG_IP_NF_MANGLE=y
@@ -251,7 +249,6 @@ CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
 CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
 CONFIG_EDAC=y
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
 	.set T1, REG_T1
 .endm
 
-#define K_BASE		%r8
 #define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
 #define BUFFER_PTR	%r10
 #define BUFFER_PTR2	%r13
-#define BUFFER_END	%r11
 
 #define PRECALC_BUF	%r14
 #define WK_BUF		%r15
@@ -205,14 +204,14 @@
 		 * blended AVX2 and ALU instruction scheduling
 		 * 1 vector iteration per 8 rounds
 		 */
-		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
 	.elseif ((i & 7) == 1)
-		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
 			 WY_TMP, WY_TMP
 	.elseif ((i & 7) == 2)
 		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
 	.elseif ((i & 7) == 4)
-		vpaddd  K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	.elseif ((i & 7) == 7)
 		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@
 		vpxor	WY, WY_TMP, WY_TMP
 	.elseif ((i & 7) == 7)
 		vpxor	WY_TMP2, WY_TMP, WY
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
 		vpsrld	$30, WY, WY
 		vpor	WY, WY_TMP, WY
 	.elseif ((i & 7) == 7)
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
 
 .endm
 
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov     \a, RTA
+	add     $\d, RTA
+	cmp     $\c, \b
+	cmovge  RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@
 	lea	(2*4*80+32)(%rsp), WK_BUF
 
 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept    160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg	WK_BUF, PRECALC_BUF
 
 	.align 32
@@ -479,8 +491,8 @@ _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp	K_BASE, BUFFER_PTR
-	jne	_begin
+	test BLOCKS_CTR, BLOCKS_CTR
+	jnz _begin
 	.align 32
 	jmp	_end
 	.align 32
@@ -512,10 +524,10 @@ _loop0:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR       /* move to next odd-64-byte block */
-	cmp	BUFFER_END, BUFFER_PTR    /* is current block the last one? */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
-
+	/* Update Counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
 	UPDATE_HASH	12(HASH_PTR), D
 	UPDATE_HASH	16(HASH_PTR), E
 
-	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
-	je	_loop
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	_loop
 
 	mov	TB, B
 
@@ -575,10 +587,10 @@ _loop2:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
-
-	cmp	BUFFER_END, BUFFER_PTR2   /* is current block the last one */
-	cmovae	K_BASE, BUFFER_PTR       /* signal the last iteration smartly */
+	/* update counter */
+	sub     $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
 	jmp	_loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:
 
 	avx2_zeroupper
 
-	lea	K_XMM_AR(%rip), K_BASE
-
+	/* Setup initial values */
 	mov	CTX, HASH_PTR
 	mov	BUF, BUFFER_PTR
-	lea	64(BUF), BUFFER_PTR2
-
-	shl	$6, CNT			/* mul by 64 */
-	add	BUF, CNT
-	add	$64, CNT
-	mov	CNT, BUFFER_END
 
-	cmp	BUFFER_END, BUFFER_PTR2
-	cmovae	K_BASE, BUFFER_PTR2
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
 
 	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f960a043cdeb..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 
 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
 # Makefile for the x86 low level entry code
 #
 
-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o   := y
 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
 
 CFLAGS_syscall_64.o		+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
 #include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
 
 /*
 
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %rdx, 12*8+\offset(%rsp)
 	movq %rsi, 13*8+\offset(%rsp)
 	movq %rdi, 14*8+\offset(%rsp)
+	UNWIND_HINT_REGS offset=\offset extra=0
 	.endm
 	.macro SAVE_C_REGS offset=0
 	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %r12, 3*8+\offset(%rsp)
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
+	UNWIND_HINT_REGS offset=\offset
 	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq 3*8+\offset(%rsp), %r12
 	movq 4*8+\offset(%rsp), %rbp
 	movq 5*8+\offset(%rsp), %rbx
+	UNWIND_HINT_REGS offset=\offset extra=0
 	.endm
 
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
 	.endif
 	movq 13*8(%rsp), %rsi
 	movq 14*8(%rsp), %rdi
+	UNWIND_HINT_IRET_REGS offset=16*8
 	.endm
 	.macro RESTORE_C_REGS
 	RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
 #include <linux/livepatch.h>
+#include <linux/syscalls.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
 
+	addr_limit_user_check();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
 		local_irq_disable();
 
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 48ef7bb32c42..8a13d468635a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -673,16 +673,8 @@ ENTRY(name)				\
 	jmp	ret_from_intr;		\
 ENDPROC(name)
 
-
-#ifdef CONFIG_TRACING
-# define TRACE_BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-# define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-
 #define BUILD_INTERRUPT(name, nr)		\
 	BUILD_INTERRUPT3(name, nr, smp_##name);	\
-	TRACE_BUILD_INTERRUPT(name, nr)
 
 /* The include is where all of the SMP etc. interrupts come from */
 #include <asm/entry_arch.h>
@@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback)
 ENDPROC(xen_failsafe_callback)
 
 BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
-		xen_evtchn_do_upcall)
+		 xen_evtchn_do_upcall)
 
 #endif /* CONFIG_XEN */
 
 #if IS_ENABLED(CONFIG_HYPERV)
 
 BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
-	hyperv_vector_handler)
+		 hyperv_vector_handler)
 
 #endif /* CONFIG_HYPERV */
 
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
-	ASM_CLAC
-	pushl	$trace_do_page_fault
-	jmp	common_exception
-END(trace_page_fault)
-#endif
-
 ENTRY(page_fault)
 	ASM_CLAC
 	pushl	$do_page_fault
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a9a8027a6c0e..49167258d587 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
+#include <asm/frame.h>
 #include <linux/err.h>
 
 .code64
@@ -43,9 +44,10 @@
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret64)
+	UNWIND_HINT_EMPTY
 	swapgs
 	sysretq
-ENDPROC(native_usergs_sysret64)
+END(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
 .macro TRACE_IRQS_IRETQ
@@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64)
  */
 
 ENTRY(entry_SYSCALL_64)
+	UNWIND_HINT_EMPTY
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 	 * it is too small to ever cause noticeable irq latency.
 	 */
-	SWAPGS_UNSAFE_STACK
-	/*
-	 * A hypervisor implementation might want to use a label
-	 * after the swapgs, so that it can do the swapgs
-	 * for the guest and jump here on syscall.
-	 */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+	swapgs
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%rax				/* pt_regs->orig_ax */
 	pushq	%rdi				/* pt_regs->di */
 	pushq	%rsi				/* pt_regs->si */
@@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r10				/* pt_regs->r10 */
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
+	UNWIND_HINT_REGS extra=0
 
 	/*
 	 * If we need to do entry work or if we guess we'll need to do
@@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath:
 	movq	EFLAGS(%rsp), %r11
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
+	UNWIND_HINT_EMPTY
 	USERGS_SYSRET64
 
 1:
@@ -316,6 +316,7 @@ syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
+	UNWIND_HINT_EMPTY
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
@@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	popq	%rax
+	UNWIND_HINT_REGS extra=0
 	jmp	entry_SYSCALL64_slow_path
 
 1:
@@ -351,6 +353,7 @@ END(stub_ptregs_64)
 
 .macro ptregs_stub func
 ENTRY(ptregs_\func)
+	UNWIND_HINT_FUNC
 	leaq	\func(%rip), %rax
 	jmp	stub_ptregs_64
 END(ptregs_\func)
@@ -367,6 +370,7 @@ END(ptregs_\func)
  * %rsi: next task
  */
 ENTRY(__switch_to_asm)
+	UNWIND_HINT_FUNC
 	/*
 	 * Save callee-saved registers
 	 * This must match the order in inactive_task_frame
@@ -406,6 +410,7 @@ END(__switch_to_asm)
  * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
+	UNWIND_HINT_EMPTY
 	movq	%rax, %rdi
 	call	schedule_tail			/* rdi: 'prev' task parameter */
 
@@ -413,6 +418,7 @@ ENTRY(ret_from_fork)
 	jnz	1f				/* kernel threads are uncommon */
 
 2:
+	UNWIND_HINT_REGS
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
@@ -440,13 +446,102 @@ END(ret_from_fork)
 ENTRY(irq_entries_start)
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+	UNWIND_HINT_IRET_REGS
 	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
-    vector=vector+1
 	jmp	common_interrupt
 	.align	8
+	vector=vector+1
     .endr
 END(irq_entries_start)
 
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+	pushfq
+	testl $X86_EFLAGS_IF, (%rsp)
+	jz .Lokay_\@
+	ud2
+.Lokay_\@:
+	addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK regs=1 old_rsp
+	DEBUG_ENTRY_ASSERT_IRQS_OFF
+	movq	%rsp, \old_rsp
+
+	.if \regs
+	UNWIND_HINT_REGS base=\old_rsp
+	.endif
+
+	incl	PER_CPU_VAR(irq_count)
+	jnz	.Lirq_stack_push_old_rsp_\@
+
+	/*
+	 * Right now, if we just incremented irq_count to zero, we've
+	 * claimed the IRQ stack but we haven't switched to it yet.
+	 *
+	 * If anything is added that can interrupt us here without using IST,
+	 * it must be *extremely* careful to limit its stack usage.  This
+	 * could include kprobes and a hypothetical future IST-less #DB
+	 * handler.
+	 *
+	 * The OOPS unwinder relies on the word at the top of the IRQ
+	 * stack linking back to the previous RSP for the entire time we're
+	 * on the IRQ stack.  For this to work reliably, we need to write
+	 * it before we actually move ourselves to the IRQ stack.
+	 */
+
+	movq	\old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+	movq	PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+	/*
+	 * If the first movq above becomes wrong due to IRQ stack layout
+	 * changes, the only way we'll notice is if we try to unwind right
+	 * here.  Assert that we set up the stack right to catch this type
+	 * of bug quickly.
+	 */
+	cmpq	-8(%rsp), \old_rsp
+	je	.Lirq_stack_okay\@
+	ud2
+	.Lirq_stack_okay\@:
+#endif
+
+.Lirq_stack_push_old_rsp_\@:
+	pushq	\old_rsp
+
+	.if \regs
+	UNWIND_HINT_REGS indirect=1
+	.endif
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK regs=1
+	DEBUG_ENTRY_ASSERT_IRQS_OFF
+	/* We need to be off the IRQ stack before decrementing irq_count. */
+	popq	%rsp
+
+	.if \regs
+	UNWIND_HINT_REGS
+	.endif
+
+	/*
+	 * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+	 * the irq stack but we're not on it.
+	 */
+
+	decl	PER_CPU_VAR(irq_count)
+.endm
+
 /*
  * Interrupt entry/exit.
  *
@@ -485,17 +580,7 @@ END(irq_entries_start)
 	CALL_enter_from_user_mode
 
 1:
-	/*
-	 * Save previous stack pointer, optionally switch to interrupt stack.
-	 * irq_count is used to check if a CPU is already on an interrupt stack
-	 * or not. While this is essentially redundant with preempt_count it is
-	 * a little cheaper to use a separate counter in the PDA (short of
-	 * moving irq_enter into assembly, which would be too much work)
-	 */
-	movq	%rsp, %rdi
-	incl	PER_CPU_VAR(irq_count)
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rdi
+	ENTER_IRQ_STACK old_rsp=%rdi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
@@ -515,10 +600,8 @@ common_interrupt:
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	decl	PER_CPU_VAR(irq_count)
 
-	/* Restore saved previous stack */
-	popq	%rsp
+	LEAVE_IRQ_STACK
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
@@ -561,6 +644,7 @@ restore_c_regs_and_iret:
 	INTERRUPT_RETURN
 
 ENTRY(native_iret)
+	UNWIND_HINT_IRET_REGS
 	/*
 	 * Are we returning to a stack segment from the LDT?  Note: in
 	 * 64-bit mode SS:RSP on the exception stack is always valid.
@@ -633,6 +717,7 @@ native_irq_return_ldt:
 	orq	PER_CPU_VAR(espfix_stack), %rax
 	SWAPGS
 	movq	%rax, %rsp
+	UNWIND_HINT_IRET_REGS offset=8
 
 	/*
 	 * At this point, we cannot write to the stack any more, but we can
@@ -654,6 +739,7 @@ END(common_interrupt)
  */
 .macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
+	UNWIND_HINT_IRET_REGS
 	ASM_CLAC
 	pushq	$~(\num)
 .Lcommon_\sym:
@@ -662,31 +748,13 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
-#ifdef CONFIG_TRACING
-#define trace(sym) trace_##sym
-#define smp_trace(sym) smp_trace_##sym
-
-.macro trace_apicinterrupt num sym
-apicinterrupt3 \num trace(\sym) smp_trace(\sym)
-.endm
-#else
-.macro trace_apicinterrupt num sym do_sym
-.endm
-#endif
-
 /* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY	.popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY	.popsection
 
 .macro apicinterrupt num sym do_sym
 PUSH_SECTION_IRQENTRY
 apicinterrupt3 \num \sym \do_sym
-trace_apicinterrupt \num \sym
 POP_SECTION_IRQENTRY
 .endm
 
@@ -705,6 +773,7 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi
 #ifdef CONFIG_HAVE_KVM
 apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
 apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
+apicinterrupt3 POSTED_INTR_NESTED_VECTOR	kvm_posted_intr_nested_ipi	smp_kvm_posted_intr_nested_ipi
 #endif
 
 #ifdef CONFIG_X86_MCE_THRESHOLD
@@ -739,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
+	UNWIND_HINT_IRET_REGS offset=8
+
 	/* Sanity check */
 	.if \shift_ist != -1 && \paranoid == 0
 	.error "using shift_ist requires paranoid=1"
 	.endif
 
 	ASM_CLAC
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
 
 	.ifeq \has_error_code
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
@@ -762,6 +832,7 @@ ENTRY(\sym)
 	.else
 	call	error_entry
 	.endif
+	UNWIND_HINT_REGS
 	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
 	.if \paranoid
@@ -828,17 +899,6 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
-#ifdef CONFIG_TRACING
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#else
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#endif
-
 idtentry divide_error			do_divide_error			has_error_code=0
 idtentry overflow			do_overflow			has_error_code=0
 idtentry bounds				do_bounds			has_error_code=0
@@ -859,6 +919,7 @@ idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
 	 * edi:  new selector
 	 */
 ENTRY(native_load_gs_index)
+	FRAME_BEGIN
 	pushfq
 	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
@@ -867,8 +928,9 @@ ENTRY(native_load_gs_index)
 2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
 	SWAPGS
 	popfq
+	FRAME_END
 	ret
-END(native_load_gs_index)
+ENDPROC(native_load_gs_index)
 EXPORT_SYMBOL(native_load_gs_index)
 
 	_ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -891,17 +953,15 @@ bad_gs:
 ENTRY(do_softirq_own_stack)
 	pushq	%rbp
 	mov	%rsp, %rbp
-	incl	PER_CPU_VAR(irq_count)
-	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
-	push	%rbp				/* frame pointer backlink */
+	ENTER_IRQ_STACK regs=0 old_rsp=%r11
 	call	__do_softirq
+	LEAVE_IRQ_STACK regs=0
 	leaveq
-	decl	PER_CPU_VAR(irq_count)
 	ret
-END(do_softirq_own_stack)
+ENDPROC(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
 
 /*
  * A note on the "critical region" in our callback handler.
@@ -922,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
+	UNWIND_HINT_FUNC
 	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
-11:	incl	PER_CPU_VAR(irq_count)
-	movq	%rsp, %rbp
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rbp				/* frame pointer backlink */
+	UNWIND_HINT_REGS
+
+	ENTER_IRQ_STACK old_rsp=%r10
 	call	xen_evtchn_do_upcall
-	popq	%rsp
-	decl	PER_CPU_VAR(irq_count)
+	LEAVE_IRQ_STACK
+
 #ifndef CONFIG_PREEMPT
 	call	xen_maybe_preempt_hcall
 #endif
@@ -950,6 +1010,7 @@ END(xen_do_hypervisor_callback)
  * with its current contents: any discrepancy means we in category 1.
  */
 ENTRY(xen_failsafe_callback)
+	UNWIND_HINT_EMPTY
 	movl	%ds, %ecx
 	cmpw	%cx, 0x10(%rsp)
 	jne	1f
@@ -967,13 +1028,13 @@ ENTRY(xen_failsafe_callback)
 	movq	8(%rsp), %r11
 	addq	$0x30, %rsp
 	pushq	$0				/* RIP */
-	pushq	%r11
-	pushq	%rcx
+	UNWIND_HINT_IRET_REGS offset=8
 	jmp	general_protection
 1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
 	movq	(%rsp), %rcx
 	movq	8(%rsp), %r11
 	addq	$0x30, %rsp
+	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
@@ -997,13 +1058,12 @@ idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
 idtentry stack_segment		do_stack_segment	has_error_code=1
 
 #ifdef CONFIG_XEN
-idtentry xen_debug		do_debug		has_error_code=0
-idtentry xen_int3		do_int3			has_error_code=0
-idtentry xen_stack_segment	do_stack_segment	has_error_code=1
+idtentry xendebug		do_debug		has_error_code=0
+idtentry xenint3		do_int3			has_error_code=0
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-trace_idtentry page_fault	do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1
 
 #ifdef CONFIG_KVM_GUEST
 idtentry async_page_fault	do_async_page_fault	has_error_code=1
@@ -1019,6 +1079,7 @@ idtentry machine_check					has_error_code=0	paranoid=1 do_sym=*machine_check_vec
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
+	UNWIND_HINT_FUNC
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1046,6 +1107,7 @@ END(paranoid_entry)
  * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
 ENTRY(paranoid_exit)
+	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
 	testl	%ebx, %ebx			/* swapgs needed? */
@@ -1067,6 +1129,7 @@ END(paranoid_exit)
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
+	UNWIND_HINT_FUNC
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1151,6 +1214,7 @@ END(error_entry)
  *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
  */
 ENTRY(error_exit)
+	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	testl	%ebx, %ebx
@@ -1159,19 +1223,9 @@ ENTRY(error_exit)
 END(error_exit)
 
 /* Runs on exception stack */
+/* XXX: broken on Xen PV */
 ENTRY(nmi)
-	/*
-	 * Fix up the exception frame if we're on Xen.
-	 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
-	 * one value to the stack on native, so it may clobber the rdx
-	 * scratch slot, but it won't clobber any of the important
-	 * slots past it.
-	 *
-	 * Xen is a different story, because the Xen frame itself overlaps
-	 * the "NMI executing" variable.
-	 */
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-
+	UNWIND_HINT_IRET_REGS
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
 	 * the iretq it performs will take us out of NMI context.
@@ -1210,6 +1264,8 @@ ENTRY(nmi)
 	 * other IST entries.
 	 */
 
+	ASM_CLAC
+
 	/* Use %rdx as our temp variable throughout */
 	pushq	%rdx
 
@@ -1231,11 +1287,13 @@ ENTRY(nmi)
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT_IRET_REGS base=%rdx offset=8
 	pushq	5*8(%rdx)	/* pt_regs->ss */
 	pushq	4*8(%rdx)	/* pt_regs->rsp */
 	pushq	3*8(%rdx)	/* pt_regs->flags */
 	pushq	2*8(%rdx)	/* pt_regs->cs */
 	pushq	1*8(%rdx)	/* pt_regs->rip */
+	UNWIND_HINT_IRET_REGS
 	pushq   $-1		/* pt_regs->orig_ax */
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
@@ -1252,6 +1310,7 @@ ENTRY(nmi)
 	pushq	%r13		/* pt_regs->r13 */
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 	ENCODE_FRAME_POINTER
 
 	/*
@@ -1406,6 +1465,7 @@ first_nmi:
 	.rept 5
 	pushq	11*8(%rsp)
 	.endr
+	UNWIND_HINT_IRET_REGS
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1421,6 +1481,7 @@ first_nmi:
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$1f		/* RIP */
 	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+	UNWIND_HINT_IRET_REGS
 1:
 #endif
 
@@ -1470,6 +1531,7 @@ end_repeat_nmi:
 	 * exceptions might do.
 	 */
 	call	paranoid_entry
+	UNWIND_HINT_REGS
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq	%rsp, %rdi
@@ -1507,17 +1569,19 @@ nmi_restore:
 END(nmi)
 
 ENTRY(ignore_sysret)
+	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
 	sysret
 END(ignore_sysret)
 
 ENTRY(rewind_stack_do_exit)
+	UNWIND_HINT_FUNC
 	/* Prevent any naive code from trying to unwind to our caller. */
 	xorl	%ebp, %ebp
 
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
-	leaq	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+	leaq	-PTREGS_SIZE(%rax), %rsp
+	UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
 
 	call	do_exit
-1:	jmp 1b
 END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..e26c25ca7756 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
  */
 ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	swapgs
 
 	/* Stash user ESP and switch to the kernel stack. */
 	movl	%esp, %r8d
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax, %eax
-
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER32_DS		/* pt_regs->ss */
 	pushq	%r8			/* pt_regs->sp */
 	pushq	%r11			/* pt_regs->flags */
 	pushq	$__USER32_CS		/* pt_regs->cs */
 	pushq	%rcx			/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+	movl	%eax, %eax		/* discard orig_ax high bits */
 	pushq	%rax			/* pt_regs->orig_ax */
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
@@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat)
 	/*
 	 * Interrupts are off on entry.
 	 */
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	ASM_CLAC			/* Do this early to minimize exposure */
 	SWAPGS
 
@@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat)
 	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
 
-	ALIGN
-GLOBAL(stub32_clone)
+ENTRY(stub32_clone)
 	/*
 	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
 	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -353,3 +350,4 @@ GLOBAL(stub32_clone)
 	 */
 	xchg	%r8, %rcx
 	jmp	sys_clone
+ENDPROC(stub32_clone)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 726355ce8497..1911310959f8 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg)
 	 * and 8 bits for the node)
 	 */
 	d.limit0 = cpu | ((node & 0xf) << 12);
-	d.limit = node >> 4;
+	d.limit1 = node >> 4;
 	d.type = 5;		/* RO data, expand down, accessed */
 	d.dpl = 3;		/* Visible to user code */
 	d.s = 1;		/* Not a system segment */
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index ad44af0dd667..f5cbbba99283 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
 
 	if (amd_uncore_llc) {
 		unsigned int apicid = cpu_data(cpu).apicid;
-		unsigned int nshared;
+		unsigned int nshared, subleaf, prev_eax = 0;
 
 		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
-		cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
-		nshared = ((eax >> 14) & 0xfff) + 1;
+		/*
+		 * Iterate over Cache Topology Definition leaves until no
+		 * more cache descriptions are available.
+		 */
+		for (subleaf = 0; subleaf < 5; subleaf++) {
+			cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
+
+			/* EAX[0:4] gives type of cache */
+			if (!(eax & 0x1f))
+				break;
+
+			prev_eax = eax;
+		}
+		nshared = ((prev_eax >> 14) & 0xfff) + 1;
+
 		uncore->id = apicid - (apicid % nshared);
 
 		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
@@ -555,7 +568,7 @@ static int __init amd_uncore_init(void)
 		ret = 0;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
 		amd_uncore_llc = alloc_percpu(struct amd_uncore *);
 		if (!amd_uncore_llc) {
 			ret = -ENOMEM;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index ff1ea2fb9705..80534d3c2480 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -191,8 +191,8 @@ static void release_pmc_hardware(void) {}
 
 static bool check_hw_exists(void)
 {
-	u64 val, val_fail, val_new= ~0;
-	int i, reg, reg_fail, ret = 0;
+	u64 val, val_fail = -1, val_new= ~0;
+	int i, reg, reg_fail = -1, ret = 0;
 	int bios_fail = 0;
 	int reg_safe = -1;
 
@@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event)
 	return m == b;
 }
 
-int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_max_precise(void)
 {
-	if (event->attr.precise_ip) {
-		int precise = 0;
+	int precise = 0;
 
-		/* Support for constant skid */
-		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+	/* Support for constant skid */
+	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+		precise++;
+
+		/* Support for IP fixup */
+		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
 			precise++;
 
-			/* Support for IP fixup */
-			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
-				precise++;
+		if (x86_pmu.pebs_prec_dist)
+			precise++;
+	}
+	return precise;
+}
 
-			if (x86_pmu.pebs_prec_dist)
-				precise++;
-		}
+int x86_pmu_hw_config(struct perf_event *event)
+{
+	if (event->attr.precise_ip) {
+		int precise = x86_pmu_max_precise();
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
@@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
 }
 
 static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
 
 static int __init init_hw_perf_events(void)
 {
@@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void)
 
 	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
 
+	if (x86_pmu.caps_attrs) {
+		struct attribute **tmp;
+
+		tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+		if (!WARN_ON(!tmp))
+			x86_pmu_caps_group.attrs = tmp;
+	}
+
 	if (x86_pmu.event_attrs)
 		x86_pmu_events_group.attrs = x86_pmu.event_attrs;
 
@@ -2114,7 +2129,7 @@ static void refresh_pce(void *ignored)
 	load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
 }
 
-static void x86_pmu_event_mapped(struct perf_event *event)
+static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;
@@ -2129,22 +2144,20 @@ static void x86_pmu_event_mapped(struct perf_event *event)
 	 * For now, this can't happen because all callers hold mmap_sem
 	 * for write.  If this changes, we'll need a different solution.
 	 */
-	lockdep_assert_held_exclusive(&current->mm->mmap_sem);
+	lockdep_assert_held_exclusive(&mm->mmap_sem);
 
-	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
-		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
+		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
 }
 
-static void x86_pmu_event_unmapped(struct perf_event *event)
+static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
 {
-	if (!current->mm)
-		return;
 
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;
 
-	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
-		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
+		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
 }
 
 static int x86_pmu_event_idx(struct perf_event *event)
@@ -2215,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = {
 	.attrs = x86_pmu_attrs,
 };
 
+static ssize_t max_precise_show(struct device *cdev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+	&dev_attr_max_precise.attr,
+	NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+	.name = "caps",
+	.attrs = x86_pmu_caps_attrs,
+};
+
 static const struct attribute_group *x86_pmu_attr_groups[] = {
 	&x86_pmu_attr_group,
 	&x86_pmu_format_group,
 	&x86_pmu_events_group,
+	&x86_pmu_caps_group,
 	NULL,
 };
 
@@ -2337,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment)
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;
 
-		if (idx > LDT_ENTRIES)
-			return 0;
-
 		/* IRQs are off, so this synchronizes with smp_store_release */
 		ldt = lockless_dereference(current->active_mm->context.ldt);
-		if (!ldt || idx > ldt->nr_entries)
+		if (!ldt || idx >= ldt->nr_entries)
 			return 0;
 
 		desc = &ldt->entries[idx];
@@ -2350,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment)
 		return 0;
 #endif
 	} else {
-		if (idx > GDT_ENTRIES)
+		if (idx >= GDT_ENTRIES)
 			return 0;
 
 		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index 06c2baa51814..e9d8520a801a 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 8ae8c5ce3a1f..16076eb34699 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -69,7 +69,7 @@ struct bts_buffer {
 	struct bts_phys	buf[0];
 };
 
-struct pmu bts_pmu;
+static struct pmu bts_pmu;
 
 static size_t buf_size(struct page *page)
 {
@@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags)
 	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
 	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
 
-	event->hw.itrace_started = 1;
+	perf_event_itrace_started(event);
 	event->hw.state = 0;
 
 	__bts_event_start(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index aa62437d1aa1..829e89cfcee2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1708,6 +1708,120 @@ static __initconst const u64 glm_hw_cache_extra_regs
 	},
 };
 
+static __initconst const u64 glp_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(L1D)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+			[C(RESULT_MISS)]	= 0x0,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+			[C(RESULT_MISS)]	= 0x0,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+	[C(L1I)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x0380,	/* ICACHE.ACCESSES */
+			[C(RESULT_MISS)]	= 0x0280,	/* ICACHE.MISSES */
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= -1,
+			[C(RESULT_MISS)]	= -1,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x1b7,	/* OFFCORE_RESPONSE */
+			[C(RESULT_MISS)]	= 0x1b7,	/* OFFCORE_RESPONSE */
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= 0x1b7,	/* OFFCORE_RESPONSE */
+			[C(RESULT_MISS)]	= 0x1b7,	/* OFFCORE_RESPONSE */
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+	[C(DTLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+			[C(RESULT_MISS)]	= 0xe08,	/* DTLB_LOAD_MISSES.WALK_COMPLETED */
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+			[C(RESULT_MISS)]	= 0xe49,	/* DTLB_STORE_MISSES.WALK_COMPLETED */
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+	[C(ITLB)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x00c0,	/* INST_RETIRED.ANY_P */
+			[C(RESULT_MISS)]	= 0x0481,	/* ITLB.MISS */
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= -1,
+			[C(RESULT_MISS)]	= -1,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= -1,
+			[C(RESULT_MISS)]	= -1,
+		},
+	},
+	[C(BPU)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= 0x00c4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+			[C(RESULT_MISS)]	= 0x00c5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= -1,
+			[C(RESULT_MISS)]	= -1,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= -1,
+			[C(RESULT_MISS)]	= -1,
+		},
+	},
+};
+
+static __initconst const u64 glp_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= GLM_DEMAND_READ|
+						  GLM_LLC_ACCESS,
+			[C(RESULT_MISS)]	= GLM_DEMAND_READ|
+						  GLM_LLC_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= GLM_DEMAND_WRITE|
+						  GLM_LLC_ACCESS,
+			[C(RESULT_MISS)]	= GLM_DEMAND_WRITE|
+						  GLM_LLC_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+};
+
 #define KNL_OT_L2_HITE		BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF		BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL	BIT_ULL(21)
@@ -3016,6 +3130,9 @@ static int hsw_hw_config(struct perf_event *event)
 	return 0;
 }
 
+static struct event_constraint counter0_constraint =
+			INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
+
 static struct event_constraint counter2_constraint =
 			EVENT_CONSTRAINT(0, 0x4, 0);
 
@@ -3037,6 +3154,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	return c;
 }
 
+static struct event_constraint *
+glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	/* :ppp means to do reduced skid PEBS which is PMC0 only. */
+	if (event->attr.precise_ip == 3)
+		return &counter0_constraint;
+
+	c = intel_get_event_constraints(cpuc, idx, event);
+
+	return c;
+}
+
 /*
  * Broadwell:
  *
@@ -3265,10 +3397,8 @@ static void intel_pmu_cpu_dying(int cpu)
 static void intel_pmu_sched_task(struct perf_event_context *ctx,
 				 bool sched_in)
 {
-	if (x86_pmu.pebs_active)
-		intel_pmu_pebs_sched_task(ctx, sched_in);
-	if (x86_pmu.lbr_nr)
-		intel_pmu_lbr_sched_task(ctx, sched_in);
+	intel_pmu_pebs_sched_task(ctx, sched_in);
+	intel_pmu_lbr_sched_task(ctx, sched_in);
 }
 
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -3285,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = {
 	&format_attr_any.attr,
 	&format_attr_inv.attr,
 	&format_attr_cmask.attr,
+	NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
 	&format_attr_in_tx.attr,
 	&format_attr_in_tx_cp.attr,
+	&format_attr_offcore_rsp.attr,
+	&format_attr_ldlat.attr,
+	NULL
+};
 
-	&format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
-	&format_attr_ldlat.attr, /* PEBS load latency */
-	NULL,
+static struct attribute *nhm_format_attr[] = {
+	&format_attr_offcore_rsp.attr,
+	&format_attr_ldlat.attr,
+	NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+	&format_attr_offcore_rsp.attr,
+	NULL
 };
 
 static struct attribute *skl_format_attr[] = {
@@ -3651,6 +3795,36 @@ done:
 
 static DEVICE_ATTR_RW(freeze_on_smi);
 
+static ssize_t branches_show(struct device *cdev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+	&dev_attr_branches.attr,
+	NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+       &dev_attr_pmu_name.attr,
+       NULL
+};
+
 static struct attribute *intel_pmu_attrs[] = {
 	&dev_attr_freeze_on_smi.attr,
 	NULL,
@@ -3665,6 +3839,8 @@ __init int intel_pmu_init(void)
 	unsigned int unused;
 	struct extra_reg *er;
 	int version, i;
+	struct attribute **extra_attr = NULL;
+	char *name;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
 		switch (boot_cpu_data.x86) {
@@ -3732,6 +3908,7 @@ __init int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case INTEL_FAM6_CORE_YONAH:
 		pr_cont("Core events, ");
+		name = "core";
 		break;
 
 	case INTEL_FAM6_CORE2_MEROM:
@@ -3747,6 +3924,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_core2_event_constraints;
 		x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
 		pr_cont("Core2 events, ");
+		name = "core2";
 		break;
 
 	case INTEL_FAM6_NEHALEM:
@@ -3775,8 +3953,11 @@ __init int intel_pmu_init(void)
 
 		intel_pmu_pebs_data_source_nhm();
 		x86_add_quirk(intel_nehalem_quirk);
+		x86_pmu.pebs_no_tlb = 1;
+		extra_attr = nhm_format_attr;
 
 		pr_cont("Nehalem events, ");
+		name = "nehalem";
 		break;
 
 	case INTEL_FAM6_ATOM_PINEVIEW:
@@ -3793,6 +3974,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
 		pr_cont("Atom events, ");
+		name = "bonnell";
 		break;
 
 	case INTEL_FAM6_ATOM_SILVERMONT1:
@@ -3810,7 +3992,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_slm_extra_regs;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.cpu_events = slm_events_attrs;
+		extra_attr = slm_format_attr;
 		pr_cont("Silvermont events, ");
+		name = "silvermont";
 		break;
 
 	case INTEL_FAM6_ATOM_GOLDMONT:
@@ -3835,7 +4019,37 @@ __init int intel_pmu_init(void)
 		x86_pmu.lbr_pt_coexist = true;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.cpu_events = glm_events_attrs;
+		extra_attr = slm_format_attr;
 		pr_cont("Goldmont events, ");
+		name = "goldmont";
+		break;
+
+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
+		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
+
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_slm_event_constraints;
+		x86_pmu.pebs_constraints = intel_glp_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_glm_extra_regs;
+		/*
+		 * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+		 * for precise cycles.
+		 */
+		x86_pmu.pebs_aliases = NULL;
+		x86_pmu.pebs_prec_dist = true;
+		x86_pmu.lbr_pt_coexist = true;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.get_event_constraints = glp_get_event_constraints;
+		x86_pmu.cpu_events = glm_events_attrs;
+		/* Goldmont Plus has 4-wide pipeline */
+		event_attr_td_total_slots_scale_glm.event_str = "4";
+		extra_attr = slm_format_attr;
+		pr_cont("Goldmont plus events, ");
+		name = "goldmont_plus";
 		break;
 
 	case INTEL_FAM6_WESTMERE:
@@ -3864,7 +4078,9 @@ __init int intel_pmu_init(void)
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
+		extra_attr = nhm_format_attr;
 		pr_cont("Westmere events, ");
+		name = "westmere";
 		break;
 
 	case INTEL_FAM6_SANDYBRIDGE:
@@ -3900,7 +4116,10 @@ __init int intel_pmu_init(void)
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
 			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
+		extra_attr = nhm_format_attr;
+
 		pr_cont("SandyBridge events, ");
+		name = "sandybridge";
 		break;
 
 	case INTEL_FAM6_IVYBRIDGE:
@@ -3934,7 +4153,10 @@ __init int intel_pmu_init(void)
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 
+		extra_attr = nhm_format_attr;
+
 		pr_cont("IvyBridge events, ");
+		name = "ivybridge";
 		break;
 
 
@@ -3962,7 +4184,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
 		x86_pmu.lbr_double_abort = true;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
 		pr_cont("Haswell events, ");
+		name = "haswell";
 		break;
 
 	case INTEL_FAM6_BROADWELL_CORE:
@@ -3998,7 +4223,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
 		x86_pmu.limit_period = bdw_limit_period;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
 		pr_cont("Broadwell events, ");
+		name = "broadwell";
 		break;
 
 	case INTEL_FAM6_XEON_PHI_KNL:
@@ -4016,8 +4244,9 @@ __init int intel_pmu_init(void)
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
+		extra_attr = slm_format_attr;
 		pr_cont("Knights Landing/Mill events, ");
+		name = "knights-landing";
 		break;
 
 	case INTEL_FAM6_SKYLAKE_MOBILE:
@@ -4047,11 +4276,14 @@ __init int intel_pmu_init(void)
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
-		x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
-						  skl_format_attr);
-		WARN_ON(!x86_pmu.format_attrs);
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
+		extra_attr = merge_attr(extra_attr, skl_format_attr);
 		x86_pmu.cpu_events = hsw_events_attrs;
+		intel_pmu_pebs_data_source_skl(
+			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
 		pr_cont("Skylake events, ");
+		name = "skylake";
 		break;
 
 	default:
@@ -4059,6 +4291,7 @@ __init int intel_pmu_init(void)
 		case 1:
 			x86_pmu.event_constraints = intel_v1_event_constraints;
 			pr_cont("generic architected perfmon v1, ");
+			name = "generic_arch_v1";
 			break;
 		default:
 			/*
@@ -4066,10 +4299,19 @@ __init int intel_pmu_init(void)
 			 */
 			x86_pmu.event_constraints = intel_gen_event_constraints;
 			pr_cont("generic architected perfmon, ");
+			name = "generic_arch_v2+";
 			break;
 		}
 	}
 
+	snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+	if (version >= 2 && extra_attr) {
+		x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+						  extra_attr);
+		WARN_ON(!x86_pmu.format_attrs);
+	}
+
 	if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
 		     x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4116,8 +4358,13 @@ __init int intel_pmu_init(void)
 			x86_pmu.lbr_nr = 0;
 	}
 
-	if (x86_pmu.lbr_nr)
+	x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+	if (x86_pmu.lbr_nr) {
+		x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+	}
+
 	/*
 	 * Access extra MSR may cause #GP under certain circumstances.
 	 * E.g. KVM doesn't support offcore event
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
deleted file mode 100644
index 2521f771f2f5..000000000000
--- a/arch/x86/events/intel/cqm.c
+++ /dev/null
@@ -1,1766 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include <linux/perf_event.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include <asm/intel_rdt_common.h>
-#include "../perf_event.h"
-
-#define MSR_IA32_QM_CTR		0x0c8e
-#define MSR_IA32_QM_EVTSEL	0x0c8d
-
-#define MBM_CNTR_WIDTH		24
-/*
- * Guaranteed time in ms as per SDM where MBM counters will not overflow.
- */
-#define MBM_CTR_OVERFLOW_TIME	1000
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-static bool cqm_enabled, mbm_enabled;
-unsigned int mbm_socket_max;
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-static struct hrtimer *mbm_timers;
-/**
- * struct sample - mbm event's (local or total) data
- * @total_bytes    #bytes since we began monitoring
- * @prev_msr       previous value of MSR
- */
-struct sample {
-	u64	total_bytes;
-	u64	prev_msr;
-};
-
-/*
- * samples profiled for total memory bandwidth type events
- */
-static struct sample *mbm_total;
-/*
- * samples profiled for local memory bandwidth type events
- */
-static struct sample *mbm_local;
-
-#define pkg_id	topology_physical_package_id(smp_processor_id())
-/*
- * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
- * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
- * rmids per socket, an example is given below
- * RMID1 of Socket0:  vrmid =  1
- * RMID1 of Socket1:  vrmid =  1 * (cqm_max_rmid + 1) + 1
- * RMID1 of Socket2:  vrmid =  2 * (cqm_max_rmid + 1) + 1
- */
-#define rmid_2_index(rmid)  ((pkg_id * (cqm_max_rmid + 1)) + rmid)
-/*
- * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
- * Also protects event->hw.cqm_rmid
- *
- * Hold either for stability, both for modification of ->hw.cqm_rmid.
- */
-static DEFINE_MUTEX(cache_mutex);
-static DEFINE_RAW_SPINLOCK(cache_lock);
-
-/*
- * Groups of events that have the same target(s), one RMID per group.
- */
-static LIST_HEAD(cache_groups);
-
-/*
- * Mask of CPUs for reading CQM values. We only need one per-socket.
- */
-static cpumask_t cqm_cpumask;
-
-#define RMID_VAL_ERROR		(1ULL << 63)
-#define RMID_VAL_UNAVAIL	(1ULL << 62)
-
-/*
- * Event IDs are used to program IA32_QM_EVTSEL before reading event
- * counter from IA32_QM_CTR
- */
-#define QOS_L3_OCCUP_EVENT_ID	0x01
-#define QOS_MBM_TOTAL_EVENT_ID	0x02
-#define QOS_MBM_LOCAL_EVENT_ID	0x03
-
-/*
- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
- *
- * This rmid is always free and is guaranteed to have an associated
- * near-zero occupancy value, i.e. no cachelines are tagged with this
- * RMID, once __intel_cqm_rmid_rotate() returns.
- */
-static u32 intel_cqm_rotation_rmid;
-
-#define INVALID_RMID		(-1)
-
-/*
- * Is @rmid valid for programming the hardware?
- *
- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
- * means that we should never come across an rmid with that value.
- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
- * assigned" and is used as part of the rotation code.
- */
-static inline bool __rmid_valid(u32 rmid)
-{
-	if (!rmid || rmid == INVALID_RMID)
-		return false;
-
-	return true;
-}
-
-static u64 __rmid_read(u32 rmid)
-{
-	u64 val;
-
-	/*
-	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
-	 * it just says that to increase confusion.
-	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, val);
-
-	/*
-	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
-	 * the number of cachelines tagged with @rmid.
-	 */
-	return val;
-}
-
-enum rmid_recycle_state {
-	RMID_YOUNG = 0,
-	RMID_AVAILABLE,
-	RMID_DIRTY,
-};
-
-struct cqm_rmid_entry {
-	u32 rmid;
-	enum rmid_recycle_state state;
-	struct list_head list;
-	unsigned long queue_time;
-};
-
-/*
- * cqm_rmid_free_lru - A least recently used list of RMIDs.
- *
- * Oldest entry at the head, newest (most recently used) entry at the
- * tail. This list is never traversed, it's only used to keep track of
- * the lru order. That is, we only pick entries of the head or insert
- * them on the tail.
- *
- * All entries on the list are 'free', and their RMIDs are not currently
- * in use. To mark an RMID as in use, remove its entry from the lru
- * list.
- *
- *
- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
- *
- * This list is contains RMIDs that no one is currently using but that
- * may have a non-zero occupancy value associated with them. The
- * rotation worker moves RMIDs from the limbo list to the free list once
- * the occupancy value drops below __intel_cqm_threshold.
- *
- * Both lists are protected by cache_mutex.
- */
-static LIST_HEAD(cqm_rmid_free_lru);
-static LIST_HEAD(cqm_rmid_limbo_lru);
-
-/*
- * We use a simple array of pointers so that we can lookup a struct
- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
- * and __put_rmid() from having to worry about dealing with struct
- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
- *
- * Once this array is initialized it is read-only. No locks are required
- * to access it.
- *
- * All entries for all RMIDs can be looked up in the this array at all
- * times.
- */
-static struct cqm_rmid_entry **cqm_rmid_ptrs;
-
-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
-{
-	struct cqm_rmid_entry *entry;
-
-	entry = cqm_rmid_ptrs[rmid];
-	WARN_ON(entry->rmid != rmid);
-
-	return entry;
-}
-
-/*
- * Returns < 0 on fail.
- *
- * We expect to be called with cache_mutex held.
- */
-static u32 __get_rmid(void)
-{
-	struct cqm_rmid_entry *entry;
-
-	lockdep_assert_held(&cache_mutex);
-
-	if (list_empty(&cqm_rmid_free_lru))
-		return INVALID_RMID;
-
-	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
-	list_del(&entry->list);
-
-	return entry->rmid;
-}
-
-static void __put_rmid(u32 rmid)
-{
-	struct cqm_rmid_entry *entry;
-
-	lockdep_assert_held(&cache_mutex);
-
-	WARN_ON(!__rmid_valid(rmid));
-	entry = __rmid_entry(rmid);
-
-	entry->queue_time = jiffies;
-	entry->state = RMID_YOUNG;
-
-	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
-}
-
-static void cqm_cleanup(void)
-{
-	int i;
-
-	if (!cqm_rmid_ptrs)
-		return;
-
-	for (i = 0; i < cqm_max_rmid; i++)
-		kfree(cqm_rmid_ptrs[i]);
-
-	kfree(cqm_rmid_ptrs);
-	cqm_rmid_ptrs = NULL;
-	cqm_enabled = false;
-}
-
-static int intel_cqm_setup_rmid_cache(void)
-{
-	struct cqm_rmid_entry *entry;
-	unsigned int nr_rmids;
-	int r = 0;
-
-	nr_rmids = cqm_max_rmid + 1;
-	cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
-				nr_rmids, GFP_KERNEL);
-	if (!cqm_rmid_ptrs)
-		return -ENOMEM;
-
-	for (; r <= cqm_max_rmid; r++) {
-		struct cqm_rmid_entry *entry;
-
-		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-		if (!entry)
-			goto fail;
-
-		INIT_LIST_HEAD(&entry->list);
-		entry->rmid = r;
-		cqm_rmid_ptrs[r] = entry;
-
-		list_add_tail(&entry->list, &cqm_rmid_free_lru);
-	}
-
-	/*
-	 * RMID 0 is special and is always allocated. It's used for all
-	 * tasks that are not monitored.
-	 */
-	entry = __rmid_entry(0);
-	list_del(&entry->list);
-
-	mutex_lock(&cache_mutex);
-	intel_cqm_rotation_rmid = __get_rmid();
-	mutex_unlock(&cache_mutex);
-
-	return 0;
-
-fail:
-	cqm_cleanup();
-	return -ENOMEM;
-}
-
-/*
- * Determine if @a and @b measure the same set of tasks.
- *
- * If @a and @b measure the same set of tasks then we want to share a
- * single RMID.
- */
-static bool __match_event(struct perf_event *a, struct perf_event *b)
-{
-	/* Per-cpu and task events don't mix */
-	if ((a->attach_state & PERF_ATTACH_TASK) !=
-	    (b->attach_state & PERF_ATTACH_TASK))
-		return false;
-
-#ifdef CONFIG_CGROUP_PERF
-	if (a->cgrp != b->cgrp)
-		return false;
-#endif
-
-	/* If not task event, we're machine wide */
-	if (!(b->attach_state & PERF_ATTACH_TASK))
-		return true;
-
-	/*
-	 * Events that target same task are placed into the same cache group.
-	 * Mark it as a multi event group, so that we update ->count
-	 * for every event rather than just the group leader later.
-	 */
-	if (a->hw.target == b->hw.target) {
-		b->hw.is_group_event = true;
-		return true;
-	}
-
-	/*
-	 * Are we an inherited event?
-	 */
-	if (b->parent == a)
-		return true;
-
-	return false;
-}
-
-#ifdef CONFIG_CGROUP_PERF
-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
-{
-	if (event->attach_state & PERF_ATTACH_TASK)
-		return perf_cgroup_from_task(event->hw.target, event->ctx);
-
-	return event->cgrp;
-}
-#endif
-
-/*
- * Determine if @a's tasks intersect with @b's tasks
- *
- * There are combinations of events that we explicitly prohibit,
- *
- *		   PROHIBITS
- *     system-wide    -> 	cgroup and task
- *     cgroup 	      ->	system-wide
- *     		      ->	task in cgroup
- *     task 	      -> 	system-wide
- *     		      ->	task in cgroup
- *
- * Call this function before allocating an RMID.
- */
-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
-{
-#ifdef CONFIG_CGROUP_PERF
-	/*
-	 * We can have any number of cgroups but only one system-wide
-	 * event at a time.
-	 */
-	if (a->cgrp && b->cgrp) {
-		struct perf_cgroup *ac = a->cgrp;
-		struct perf_cgroup *bc = b->cgrp;
-
-		/*
-		 * This condition should have been caught in
-		 * __match_event() and we should be sharing an RMID.
-		 */
-		WARN_ON_ONCE(ac == bc);
-
-		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-			return true;
-
-		return false;
-	}
-
-	if (a->cgrp || b->cgrp) {
-		struct perf_cgroup *ac, *bc;
-
-		/*
-		 * cgroup and system-wide events are mutually exclusive
-		 */
-		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
-		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
-			return true;
-
-		/*
-		 * Ensure neither event is part of the other's cgroup
-		 */
-		ac = event_to_cgroup(a);
-		bc = event_to_cgroup(b);
-		if (ac == bc)
-			return true;
-
-		/*
-		 * Must have cgroup and non-intersecting task events.
-		 */
-		if (!ac || !bc)
-			return false;
-
-		/*
-		 * We have cgroup and task events, and the task belongs
-		 * to a cgroup. Check for for overlap.
-		 */
-		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-			return true;
-
-		return false;
-	}
-#endif
-	/*
-	 * If one of them is not a task, same story as above with cgroups.
-	 */
-	if (!(a->attach_state & PERF_ATTACH_TASK) ||
-	    !(b->attach_state & PERF_ATTACH_TASK))
-		return true;
-
-	/*
-	 * Must be non-overlapping.
-	 */
-	return false;
-}
-
-struct rmid_read {
-	u32 rmid;
-	u32 evt_type;
-	atomic64_t value;
-};
-
-static void __intel_cqm_event_count(void *info);
-static void init_mbm_sample(u32 rmid, u32 evt_type);
-static void __intel_mbm_event_count(void *info);
-
-static bool is_cqm_event(int e)
-{
-	return (e == QOS_L3_OCCUP_EVENT_ID);
-}
-
-static bool is_mbm_event(int e)
-{
-	return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
-}
-
-static void cqm_mask_call(struct rmid_read *rr)
-{
-	if (is_mbm_event(rr->evt_type))
-		on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
-	else
-		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
-}
-
-/*
- * Exchange the RMID of a group of events.
- */
-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
-{
-	struct perf_event *event;
-	struct list_head *head = &group->hw.cqm_group_entry;
-	u32 old_rmid = group->hw.cqm_rmid;
-
-	lockdep_assert_held(&cache_mutex);
-
-	/*
-	 * If our RMID is being deallocated, perform a read now.
-	 */
-	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
-		struct rmid_read rr = {
-			.rmid = old_rmid,
-			.evt_type = group->attr.config,
-			.value = ATOMIC64_INIT(0),
-		};
-
-		cqm_mask_call(&rr);
-		local64_set(&group->count, atomic64_read(&rr.value));
-	}
-
-	raw_spin_lock_irq(&cache_lock);
-
-	group->hw.cqm_rmid = rmid;
-	list_for_each_entry(event, head, hw.cqm_group_entry)
-		event->hw.cqm_rmid = rmid;
-
-	raw_spin_unlock_irq(&cache_lock);
-
-	/*
-	 * If the allocation is for mbm, init the mbm stats.
-	 * Need to check if each event in the group is mbm event
-	 * because there could be multiple type of events in the same group.
-	 */
-	if (__rmid_valid(rmid)) {
-		event = group;
-		if (is_mbm_event(event->attr.config))
-			init_mbm_sample(rmid, event->attr.config);
-
-		list_for_each_entry(event, head, hw.cqm_group_entry) {
-			if (is_mbm_event(event->attr.config))
-				init_mbm_sample(rmid, event->attr.config);
-		}
-	}
-
-	return old_rmid;
-}
-
-/*
- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
- * cachelines are still tagged with RMIDs in limbo, we progressively
- * increment the threshold until we find an RMID in limbo with <=
- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
- * problem where cachelines tagged with an RMID are not steadily being
- * evicted.
- *
- * On successful rotations we decrease the threshold back towards zero.
- *
- * __intel_cqm_max_threshold provides an upper bound on the threshold,
- * and is measured in bytes because it's exposed to userland.
- */
-static unsigned int __intel_cqm_threshold;
-static unsigned int __intel_cqm_max_threshold;
-
-/*
- * Test whether an RMID has a zero occupancy value on this cpu.
- */
-static void intel_cqm_stable(void *arg)
-{
-	struct cqm_rmid_entry *entry;
-
-	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-		if (entry->state != RMID_AVAILABLE)
-			break;
-
-		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
-			entry->state = RMID_DIRTY;
-	}
-}
-
-/*
- * If we have group events waiting for an RMID that don't conflict with
- * events already running, assign @rmid.
- */
-static bool intel_cqm_sched_in_event(u32 rmid)
-{
-	struct perf_event *leader, *event;
-
-	lockdep_assert_held(&cache_mutex);
-
-	leader = list_first_entry(&cache_groups, struct perf_event,
-				  hw.cqm_groups_entry);
-	event = leader;
-
-	list_for_each_entry_continue(event, &cache_groups,
-				     hw.cqm_groups_entry) {
-		if (__rmid_valid(event->hw.cqm_rmid))
-			continue;
-
-		if (__conflict_event(event, leader))
-			continue;
-
-		intel_cqm_xchg_rmid(event, rmid);
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Initially use this constant for both the limbo queue time and the
- * rotation timer interval, pmu::hrtimer_interval_ms.
- *
- * They don't need to be the same, but the two are related since if you
- * rotate faster than you recycle RMIDs, you may run out of available
- * RMIDs.
- */
-#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
-
-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
-
-/*
- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
- * @nr_available: number of freeable RMIDs on the limbo list
- *
- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
- * cachelines are tagged with those RMIDs. After this we can reuse them
- * and know that the current set of active RMIDs is stable.
- *
- * Return %true or %false depending on whether stabilization needs to be
- * reattempted.
- *
- * If we return %true then @nr_available is updated to indicate the
- * number of RMIDs on the limbo list that have been queued for the
- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
- * are above __intel_cqm_threshold.
- */
-static bool intel_cqm_rmid_stabilize(unsigned int *available)
-{
-	struct cqm_rmid_entry *entry, *tmp;
-
-	lockdep_assert_held(&cache_mutex);
-
-	*available = 0;
-	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-		unsigned long min_queue_time;
-		unsigned long now = jiffies;
-
-		/*
-		 * We hold RMIDs placed into limbo for a minimum queue
-		 * time. Before the minimum queue time has elapsed we do
-		 * not recycle RMIDs.
-		 *
-		 * The reasoning is that until a sufficient time has
-		 * passed since we stopped using an RMID, any RMID
-		 * placed onto the limbo list will likely still have
-		 * data tagged in the cache, which means we'll probably
-		 * fail to recycle it anyway.
-		 *
-		 * We can save ourselves an expensive IPI by skipping
-		 * any RMIDs that have not been queued for the minimum
-		 * time.
-		 */
-		min_queue_time = entry->queue_time +
-			msecs_to_jiffies(__rmid_queue_time_ms);
-
-		if (time_after(min_queue_time, now))
-			break;
-
-		entry->state = RMID_AVAILABLE;
-		(*available)++;
-	}
-
-	/*
-	 * Fast return if none of the RMIDs on the limbo list have been
-	 * sitting on the queue for the minimum queue time.
-	 */
-	if (!*available)
-		return false;
-
-	/*
-	 * Test whether an RMID is free for each package.
-	 */
-	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
-
-	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
-		/*
-		 * Exhausted all RMIDs that have waited min queue time.
-		 */
-		if (entry->state == RMID_YOUNG)
-			break;
-
-		if (entry->state == RMID_DIRTY)
-			continue;
-
-		list_del(&entry->list);	/* remove from limbo */
-
-		/*
-		 * The rotation RMID gets priority if it's
-		 * currently invalid. In which case, skip adding
-		 * the RMID to the the free lru.
-		 */
-		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
-			intel_cqm_rotation_rmid = entry->rmid;
-			continue;
-		}
-
-		/*
-		 * If we have groups waiting for RMIDs, hand
-		 * them one now provided they don't conflict.
-		 */
-		if (intel_cqm_sched_in_event(entry->rmid))
-			continue;
-
-		/*
-		 * Otherwise place it onto the free list.
-		 */
-		list_add_tail(&entry->list, &cqm_rmid_free_lru);
-	}
-
-
-	return __rmid_valid(intel_cqm_rotation_rmid);
-}
-
-/*
- * Pick a victim group and move it to the tail of the group list.
- * @next: The first group without an RMID
- */
-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
-{
-	struct perf_event *rotor;
-	u32 rmid;
-
-	lockdep_assert_held(&cache_mutex);
-
-	rotor = list_first_entry(&cache_groups, struct perf_event,
-				 hw.cqm_groups_entry);
-
-	/*
-	 * The group at the front of the list should always have a valid
-	 * RMID. If it doesn't then no groups have RMIDs assigned and we
-	 * don't need to rotate the list.
-	 */
-	if (next == rotor)
-		return;
-
-	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-	__put_rmid(rmid);
-
-	list_rotate_left(&cache_groups);
-}
-
-/*
- * Deallocate the RMIDs from any events that conflict with @event, and
- * place them on the back of the group list.
- */
-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
-{
-	struct perf_event *group, *g;
-	u32 rmid;
-
-	lockdep_assert_held(&cache_mutex);
-
-	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
-		if (group == event)
-			continue;
-
-		rmid = group->hw.cqm_rmid;
-
-		/*
-		 * Skip events that don't have a valid RMID.
-		 */
-		if (!__rmid_valid(rmid))
-			continue;
-
-		/*
-		 * No conflict? No problem! Leave the event alone.
-		 */
-		if (!__conflict_event(group, event))
-			continue;
-
-		intel_cqm_xchg_rmid(group, INVALID_RMID);
-		__put_rmid(rmid);
-	}
-}
-
-/*
- * Attempt to rotate the groups and assign new RMIDs.
- *
- * We rotate for two reasons,
- *   1. To handle the scheduling of conflicting events
- *   2. To recycle RMIDs
- *
- * Rotating RMIDs is complicated because the hardware doesn't give us
- * any clues.
- *
- * There's problems with the hardware interface; when you change the
- * task:RMID map cachelines retain their 'old' tags, giving a skewed
- * picture. In order to work around this, we must always keep one free
- * RMID - intel_cqm_rotation_rmid.
- *
- * Rotation works by taking away an RMID from a group (the old RMID),
- * and assigning the free RMID to another group (the new RMID). We must
- * then wait for the old RMID to not be used (no cachelines tagged).
- * This ensure that all cachelines are tagged with 'active' RMIDs. At
- * this point we can start reading values for the new RMID and treat the
- * old RMID as the free RMID for the next rotation.
- *
- * Return %true or %false depending on whether we did any rotating.
- */
-static bool __intel_cqm_rmid_rotate(void)
-{
-	struct perf_event *group, *start = NULL;
-	unsigned int threshold_limit;
-	unsigned int nr_needed = 0;
-	unsigned int nr_available;
-	bool rotated = false;
-
-	mutex_lock(&cache_mutex);
-
-again:
-	/*
-	 * Fast path through this function if there are no groups and no
-	 * RMIDs that need cleaning.
-	 */
-	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
-		goto out;
-
-	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
-		if (!__rmid_valid(group->hw.cqm_rmid)) {
-			if (!start)
-				start = group;
-			nr_needed++;
-		}
-	}
-
-	/*
-	 * We have some event groups, but they all have RMIDs assigned
-	 * and no RMIDs need cleaning.
-	 */
-	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
-		goto out;
-
-	if (!nr_needed)
-		goto stabilize;
-
-	/*
-	 * We have more event groups without RMIDs than available RMIDs,
-	 * or we have event groups that conflict with the ones currently
-	 * scheduled.
-	 *
-	 * We force deallocate the rmid of the group at the head of
-	 * cache_groups. The first event group without an RMID then gets
-	 * assigned intel_cqm_rotation_rmid. This ensures we always make
-	 * forward progress.
-	 *
-	 * Rotate the cache_groups list so the previous head is now the
-	 * tail.
-	 */
-	__intel_cqm_pick_and_rotate(start);
-
-	/*
-	 * If the rotation is going to succeed, reduce the threshold so
-	 * that we don't needlessly reuse dirty RMIDs.
-	 */
-	if (__rmid_valid(intel_cqm_rotation_rmid)) {
-		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-		intel_cqm_rotation_rmid = __get_rmid();
-
-		intel_cqm_sched_out_conflicting_events(start);
-
-		if (__intel_cqm_threshold)
-			__intel_cqm_threshold--;
-	}
-
-	rotated = true;
-
-stabilize:
-	/*
-	 * We now need to stablize the RMID we freed above (if any) to
-	 * ensure that the next time we rotate we have an RMID with zero
-	 * occupancy value.
-	 *
-	 * Alternatively, if we didn't need to perform any rotation,
-	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
-	 */
-	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
-
-	while (intel_cqm_rmid_stabilize(&nr_available) &&
-	       __intel_cqm_threshold < threshold_limit) {
-		unsigned int steal_limit;
-
-		/*
-		 * Don't spin if nobody is actively waiting for an RMID,
-		 * the rotation worker will be kicked as soon as an
-		 * event needs an RMID anyway.
-		 */
-		if (!nr_needed)
-			break;
-
-		/* Allow max 25% of RMIDs to be in limbo. */
-		steal_limit = (cqm_max_rmid + 1) / 4;
-
-		/*
-		 * We failed to stabilize any RMIDs so our rotation
-		 * logic is now stuck. In order to make forward progress
-		 * we have a few options:
-		 *
-		 *   1. rotate ("steal") another RMID
-		 *   2. increase the threshold
-		 *   3. do nothing
-		 *
-		 * We do both of 1. and 2. until we hit the steal limit.
-		 *
-		 * The steal limit prevents all RMIDs ending up on the
-		 * limbo list. This can happen if every RMID has a
-		 * non-zero occupancy above threshold_limit, and the
-		 * occupancy values aren't dropping fast enough.
-		 *
-		 * Note that there is prioritisation at work here - we'd
-		 * rather increase the number of RMIDs on the limbo list
-		 * than increase the threshold, because increasing the
-		 * threshold skews the event data (because we reuse
-		 * dirty RMIDs) - threshold bumps are a last resort.
-		 */
-		if (nr_available < steal_limit)
-			goto again;
-
-		__intel_cqm_threshold++;
-	}
-
-out:
-	mutex_unlock(&cache_mutex);
-	return rotated;
-}
-
-static void intel_cqm_rmid_rotate(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
-
-static struct pmu intel_cqm_pmu;
-
-static void intel_cqm_rmid_rotate(struct work_struct *work)
-{
-	unsigned long delay;
-
-	__intel_cqm_rmid_rotate();
-
-	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
-	schedule_delayed_work(&intel_cqm_rmid_work, delay);
-}
-
-static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
-{
-	struct sample *mbm_current;
-	u32 vrmid = rmid_2_index(rmid);
-	u64 val, bytes, shift;
-	u32 eventid;
-
-	if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
-		mbm_current = &mbm_local[vrmid];
-		eventid     = QOS_MBM_LOCAL_EVENT_ID;
-	} else {
-		mbm_current = &mbm_total[vrmid];
-		eventid     = QOS_MBM_TOTAL_EVENT_ID;
-	}
-
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, val);
-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return mbm_current->total_bytes;
-
-	if (first) {
-		mbm_current->prev_msr = val;
-		mbm_current->total_bytes = 0;
-		return mbm_current->total_bytes;
-	}
-
-	/*
-	 * The h/w guarantees that counters will not overflow
-	 * so long as we poll them at least once per second.
-	 */
-	shift = 64 - MBM_CNTR_WIDTH;
-	bytes = (val << shift) - (mbm_current->prev_msr << shift);
-	bytes >>= shift;
-
-	bytes *= cqm_l3_scale;
-
-	mbm_current->total_bytes += bytes;
-	mbm_current->prev_msr = val;
-
-	return mbm_current->total_bytes;
-}
-
-static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
-{
-	return update_sample(rmid, evt_type, 0);
-}
-
-static void __intel_mbm_event_init(void *info)
-{
-	struct rmid_read *rr = info;
-
-	update_sample(rr->rmid, rr->evt_type, 1);
-}
-
-static void init_mbm_sample(u32 rmid, u32 evt_type)
-{
-	struct rmid_read rr = {
-		.rmid = rmid,
-		.evt_type = evt_type,
-		.value = ATOMIC64_INIT(0),
-	};
-
-	/* on each socket, init sample */
-	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
-}
-
-/*
- * Find a group and setup RMID.
- *
- * If we're part of a group, we use the group's RMID.
- */
-static void intel_cqm_setup_event(struct perf_event *event,
-				  struct perf_event **group)
-{
-	struct perf_event *iter;
-	bool conflict = false;
-	u32 rmid;
-
-	event->hw.is_group_event = false;
-	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
-		rmid = iter->hw.cqm_rmid;
-
-		if (__match_event(iter, event)) {
-			/* All tasks in a group share an RMID */
-			event->hw.cqm_rmid = rmid;
-			*group = iter;
-			if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
-				init_mbm_sample(rmid, event->attr.config);
-			return;
-		}
-
-		/*
-		 * We only care about conflicts for events that are
-		 * actually scheduled in (and hence have a valid RMID).
-		 */
-		if (__conflict_event(iter, event) && __rmid_valid(rmid))
-			conflict = true;
-	}
-
-	if (conflict)
-		rmid = INVALID_RMID;
-	else
-		rmid = __get_rmid();
-
-	if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
-		init_mbm_sample(rmid, event->attr.config);
-
-	event->hw.cqm_rmid = rmid;
-}
-
-static void intel_cqm_event_read(struct perf_event *event)
-{
-	unsigned long flags;
-	u32 rmid;
-	u64 val;
-
-	/*
-	 * Task events are handled by intel_cqm_event_count().
-	 */
-	if (event->cpu == -1)
-		return;
-
-	raw_spin_lock_irqsave(&cache_lock, flags);
-	rmid = event->hw.cqm_rmid;
-
-	if (!__rmid_valid(rmid))
-		goto out;
-
-	if (is_mbm_event(event->attr.config))
-		val = rmid_read_mbm(rmid, event->attr.config);
-	else
-		val = __rmid_read(rmid);
-
-	/*
-	 * Ignore this reading on error states and do not update the value.
-	 */
-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		goto out;
-
-	local64_set(&event->count, val);
-out:
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-}
-
-static void __intel_cqm_event_count(void *info)
-{
-	struct rmid_read *rr = info;
-	u64 val;
-
-	val = __rmid_read(rr->rmid);
-
-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
-
-	atomic64_add(val, &rr->value);
-}
-
-static inline bool cqm_group_leader(struct perf_event *event)
-{
-	return !list_empty(&event->hw.cqm_groups_entry);
-}
-
-static void __intel_mbm_event_count(void *info)
-{
-	struct rmid_read *rr = info;
-	u64 val;
-
-	val = rmid_read_mbm(rr->rmid, rr->evt_type);
-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
-	atomic64_add(val, &rr->value);
-}
-
-static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
-{
-	struct perf_event *iter, *iter1;
-	int ret = HRTIMER_RESTART;
-	struct list_head *head;
-	unsigned long flags;
-	u32 grp_rmid;
-
-	/*
-	 * Need to cache_lock as the timer Event Select MSR reads
-	 * can race with the mbm/cqm count() and mbm_init() reads.
-	 */
-	raw_spin_lock_irqsave(&cache_lock, flags);
-
-	if (list_empty(&cache_groups)) {
-		ret = HRTIMER_NORESTART;
-		goto out;
-	}
-
-	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
-		grp_rmid = iter->hw.cqm_rmid;
-		if (!__rmid_valid(grp_rmid))
-			continue;
-		if (is_mbm_event(iter->attr.config))
-			update_sample(grp_rmid, iter->attr.config, 0);
-
-		head = &iter->hw.cqm_group_entry;
-		if (list_empty(head))
-			continue;
-		list_for_each_entry(iter1, head, hw.cqm_group_entry) {
-			if (!iter1->hw.is_group_event)
-				break;
-			if (is_mbm_event(iter1->attr.config))
-				update_sample(iter1->hw.cqm_rmid,
-					      iter1->attr.config, 0);
-		}
-	}
-
-	hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
-out:
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-	return ret;
-}
-
-static void __mbm_start_timer(void *info)
-{
-	hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
-			     HRTIMER_MODE_REL_PINNED);
-}
-
-static void __mbm_stop_timer(void *info)
-{
-	hrtimer_cancel(&mbm_timers[pkg_id]);
-}
-
-static void mbm_start_timers(void)
-{
-	on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
-}
-
-static void mbm_stop_timers(void)
-{
-	on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
-}
-
-static void mbm_hrtimer_init(void)
-{
-	struct hrtimer *hr;
-	int i;
-
-	for (i = 0; i < mbm_socket_max; i++) {
-		hr = &mbm_timers[i];
-		hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		hr->function = mbm_hrtimer_handle;
-	}
-}
-
-static u64 intel_cqm_event_count(struct perf_event *event)
-{
-	unsigned long flags;
-	struct rmid_read rr = {
-		.evt_type = event->attr.config,
-		.value = ATOMIC64_INIT(0),
-	};
-
-	/*
-	 * We only need to worry about task events. System-wide events
-	 * are handled like usual, i.e. entirely with
-	 * intel_cqm_event_read().
-	 */
-	if (event->cpu != -1)
-		return __perf_event_count(event);
-
-	/*
-	 * Only the group leader gets to report values except in case of
-	 * multiple events in the same group, we still need to read the
-	 * other events.This stops us
-	 * reporting duplicate values to userspace, and gives us a clear
-	 * rule for which task gets to report the values.
-	 *
-	 * Note that it is impossible to attribute these values to
-	 * specific packages - we forfeit that ability when we create
-	 * task events.
-	 */
-	if (!cqm_group_leader(event) && !event->hw.is_group_event)
-		return 0;
-
-	/*
-	 * Getting up-to-date values requires an SMP IPI which is not
-	 * possible if we're being called in interrupt context. Return
-	 * the cached values instead.
-	 */
-	if (unlikely(in_interrupt()))
-		goto out;
-
-	/*
-	 * Notice that we don't perform the reading of an RMID
-	 * atomically, because we can't hold a spin lock across the
-	 * IPIs.
-	 *
-	 * Speculatively perform the read, since @event might be
-	 * assigned a different (possibly invalid) RMID while we're
-	 * busying performing the IPI calls. It's therefore necessary to
-	 * check @event's RMID afterwards, and if it has changed,
-	 * discard the result of the read.
-	 */
-	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
-
-	if (!__rmid_valid(rr.rmid))
-		goto out;
-
-	cqm_mask_call(&rr);
-
-	raw_spin_lock_irqsave(&cache_lock, flags);
-	if (event->hw.cqm_rmid == rr.rmid)
-		local64_set(&event->count, atomic64_read(&rr.value));
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-out:
-	return __perf_event_count(event);
-}
-
-static void intel_cqm_event_start(struct perf_event *event, int mode)
-{
-	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-	u32 rmid = event->hw.cqm_rmid;
-
-	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
-		return;
-
-	event->hw.cqm_state &= ~PERF_HES_STOPPED;
-
-	if (state->rmid_usecnt++) {
-		if (!WARN_ON_ONCE(state->rmid != rmid))
-			return;
-	} else {
-		WARN_ON_ONCE(state->rmid);
-	}
-
-	state->rmid = rmid;
-	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
-}
-
-static void intel_cqm_event_stop(struct perf_event *event, int mode)
-{
-	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-
-	if (event->hw.cqm_state & PERF_HES_STOPPED)
-		return;
-
-	event->hw.cqm_state |= PERF_HES_STOPPED;
-
-	intel_cqm_event_read(event);
-
-	if (!--state->rmid_usecnt) {
-		state->rmid = 0;
-		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
-	} else {
-		WARN_ON_ONCE(!state->rmid);
-	}
-}
-
-static int intel_cqm_event_add(struct perf_event *event, int mode)
-{
-	unsigned long flags;
-	u32 rmid;
-
-	raw_spin_lock_irqsave(&cache_lock, flags);
-
-	event->hw.cqm_state = PERF_HES_STOPPED;
-	rmid = event->hw.cqm_rmid;
-
-	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
-		intel_cqm_event_start(event, mode);
-
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-	return 0;
-}
-
-static void intel_cqm_event_destroy(struct perf_event *event)
-{
-	struct perf_event *group_other = NULL;
-	unsigned long flags;
-
-	mutex_lock(&cache_mutex);
-	/*
-	* Hold the cache_lock as mbm timer handlers could be
-	* scanning the list of events.
-	*/
-	raw_spin_lock_irqsave(&cache_lock, flags);
-
-	/*
-	 * If there's another event in this group...
-	 */
-	if (!list_empty(&event->hw.cqm_group_entry)) {
-		group_other = list_first_entry(&event->hw.cqm_group_entry,
-					       struct perf_event,
-					       hw.cqm_group_entry);
-		list_del(&event->hw.cqm_group_entry);
-	}
-
-	/*
-	 * And we're the group leader..
-	 */
-	if (cqm_group_leader(event)) {
-		/*
-		 * If there was a group_other, make that leader, otherwise
-		 * destroy the group and return the RMID.
-		 */
-		if (group_other) {
-			list_replace(&event->hw.cqm_groups_entry,
-				     &group_other->hw.cqm_groups_entry);
-		} else {
-			u32 rmid = event->hw.cqm_rmid;
-
-			if (__rmid_valid(rmid))
-				__put_rmid(rmid);
-			list_del(&event->hw.cqm_groups_entry);
-		}
-	}
-
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-	/*
-	 * Stop the mbm overflow timers when the last event is destroyed.
-	*/
-	if (mbm_enabled && list_empty(&cache_groups))
-		mbm_stop_timers();
-
-	mutex_unlock(&cache_mutex);
-}
-
-static int intel_cqm_event_init(struct perf_event *event)
-{
-	struct perf_event *group = NULL;
-	bool rotate = false;
-	unsigned long flags;
-
-	if (event->attr.type != intel_cqm_pmu.type)
-		return -ENOENT;
-
-	if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
-	     (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
-		return -EINVAL;
-
-	if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
-	    (is_mbm_event(event->attr.config) && !mbm_enabled))
-		return -EINVAL;
-
-	/* unsupported modes and filters */
-	if (event->attr.exclude_user   ||
-	    event->attr.exclude_kernel ||
-	    event->attr.exclude_hv     ||
-	    event->attr.exclude_idle   ||
-	    event->attr.exclude_host   ||
-	    event->attr.exclude_guest  ||
-	    event->attr.sample_period) /* no sampling */
-		return -EINVAL;
-
-	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
-	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
-
-	event->destroy = intel_cqm_event_destroy;
-
-	mutex_lock(&cache_mutex);
-
-	/*
-	 * Start the mbm overflow timers when the first event is created.
-	*/
-	if (mbm_enabled && list_empty(&cache_groups))
-		mbm_start_timers();
-
-	/* Will also set rmid */
-	intel_cqm_setup_event(event, &group);
-
-	/*
-	* Hold the cache_lock as mbm timer handlers be
-	* scanning the list of events.
-	*/
-	raw_spin_lock_irqsave(&cache_lock, flags);
-
-	if (group) {
-		list_add_tail(&event->hw.cqm_group_entry,
-			      &group->hw.cqm_group_entry);
-	} else {
-		list_add_tail(&event->hw.cqm_groups_entry,
-			      &cache_groups);
-
-		/*
-		 * All RMIDs are either in use or have recently been
-		 * used. Kick the rotation worker to clean/free some.
-		 *
-		 * We only do this for the group leader, rather than for
-		 * every event in a group to save on needless work.
-		 */
-		if (!__rmid_valid(event->hw.cqm_rmid))
-			rotate = true;
-	}
-
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
-	mutex_unlock(&cache_mutex);
-
-	if (rotate)
-		schedule_delayed_work(&intel_cqm_rmid_work, 0);
-
-	return 0;
-}
-
-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-
-EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
-EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
-EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
-EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
-
-EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
-EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
-EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
-EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
-
-static struct attribute *intel_cqm_events_attr[] = {
-	EVENT_PTR(intel_cqm_llc),
-	EVENT_PTR(intel_cqm_llc_pkg),
-	EVENT_PTR(intel_cqm_llc_unit),
-	EVENT_PTR(intel_cqm_llc_scale),
-	EVENT_PTR(intel_cqm_llc_snapshot),
-	NULL,
-};
-
-static struct attribute *intel_mbm_events_attr[] = {
-	EVENT_PTR(intel_cqm_total_bytes),
-	EVENT_PTR(intel_cqm_local_bytes),
-	EVENT_PTR(intel_cqm_total_bytes_pkg),
-	EVENT_PTR(intel_cqm_local_bytes_pkg),
-	EVENT_PTR(intel_cqm_total_bytes_unit),
-	EVENT_PTR(intel_cqm_local_bytes_unit),
-	EVENT_PTR(intel_cqm_total_bytes_scale),
-	EVENT_PTR(intel_cqm_local_bytes_scale),
-	NULL,
-};
-
-static struct attribute *intel_cmt_mbm_events_attr[] = {
-	EVENT_PTR(intel_cqm_llc),
-	EVENT_PTR(intel_cqm_total_bytes),
-	EVENT_PTR(intel_cqm_local_bytes),
-	EVENT_PTR(intel_cqm_llc_pkg),
-	EVENT_PTR(intel_cqm_total_bytes_pkg),
-	EVENT_PTR(intel_cqm_local_bytes_pkg),
-	EVENT_PTR(intel_cqm_llc_unit),
-	EVENT_PTR(intel_cqm_total_bytes_unit),
-	EVENT_PTR(intel_cqm_local_bytes_unit),
-	EVENT_PTR(intel_cqm_llc_scale),
-	EVENT_PTR(intel_cqm_total_bytes_scale),
-	EVENT_PTR(intel_cqm_local_bytes_scale),
-	EVENT_PTR(intel_cqm_llc_snapshot),
-	NULL,
-};
-
-static struct attribute_group intel_cqm_events_group = {
-	.name = "events",
-	.attrs = NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7");
-static struct attribute *intel_cqm_formats_attr[] = {
-	&format_attr_event.attr,
-	NULL,
-};
-
-static struct attribute_group intel_cqm_format_group = {
-	.name = "format",
-	.attrs = intel_cqm_formats_attr,
-};
-
-static ssize_t
-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
-			   char *page)
-{
-	ssize_t rv;
-
-	mutex_lock(&cache_mutex);
-	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
-	mutex_unlock(&cache_mutex);
-
-	return rv;
-}
-
-static ssize_t
-max_recycle_threshold_store(struct device *dev,
-			    struct device_attribute *attr,
-			    const char *buf, size_t count)
-{
-	unsigned int bytes, cachelines;
-	int ret;
-
-	ret = kstrtouint(buf, 0, &bytes);
-	if (ret)
-		return ret;
-
-	mutex_lock(&cache_mutex);
-
-	__intel_cqm_max_threshold = bytes;
-	cachelines = bytes / cqm_l3_scale;
-
-	/*
-	 * The new maximum takes effect immediately.
-	 */
-	if (__intel_cqm_threshold > cachelines)
-		__intel_cqm_threshold = cachelines;
-
-	mutex_unlock(&cache_mutex);
-
-	return count;
-}
-
-static DEVICE_ATTR_RW(max_recycle_threshold);
-
-static struct attribute *intel_cqm_attrs[] = {
-	&dev_attr_max_recycle_threshold.attr,
-	NULL,
-};
-
-static const struct attribute_group intel_cqm_group = {
-	.attrs = intel_cqm_attrs,
-};
-
-static const struct attribute_group *intel_cqm_attr_groups[] = {
-	&intel_cqm_events_group,
-	&intel_cqm_format_group,
-	&intel_cqm_group,
-	NULL,
-};
-
-static struct pmu intel_cqm_pmu = {
-	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
-	.attr_groups	     = intel_cqm_attr_groups,
-	.task_ctx_nr	     = perf_sw_context,
-	.event_init	     = intel_cqm_event_init,
-	.add		     = intel_cqm_event_add,
-	.del		     = intel_cqm_event_stop,
-	.start		     = intel_cqm_event_start,
-	.stop		     = intel_cqm_event_stop,
-	.read		     = intel_cqm_event_read,
-	.count		     = intel_cqm_event_count,
-};
-
-static inline void cqm_pick_event_reader(int cpu)
-{
-	int reader;
-
-	/* First online cpu in package becomes the reader */
-	reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
-	if (reader >= nr_cpu_ids)
-		cpumask_set_cpu(cpu, &cqm_cpumask);
-}
-
-static int intel_cqm_cpu_starting(unsigned int cpu)
-{
-	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-	state->rmid = 0;
-	state->closid = 0;
-	state->rmid_usecnt = 0;
-
-	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
-	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
-
-	cqm_pick_event_reader(cpu);
-	return 0;
-}
-
-static int intel_cqm_cpu_exit(unsigned int cpu)
-{
-	int target;
-
-	/* Is @cpu the current cqm reader for this package ? */
-	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
-		return 0;
-
-	/* Find another online reader in this package */
-	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
-
-	if (target < nr_cpu_ids)
-		cpumask_set_cpu(target, &cqm_cpumask);
-
-	return 0;
-}
-
-static const struct x86_cpu_id intel_cqm_match[] = {
-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
-	{}
-};
-
-static void mbm_cleanup(void)
-{
-	if (!mbm_enabled)
-		return;
-
-	kfree(mbm_local);
-	kfree(mbm_total);
-	mbm_enabled = false;
-}
-
-static const struct x86_cpu_id intel_mbm_local_match[] = {
-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
-	{}
-};
-
-static const struct x86_cpu_id intel_mbm_total_match[] = {
-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
-	{}
-};
-
-static int intel_mbm_init(void)
-{
-	int ret = 0, array_size, maxid = cqm_max_rmid + 1;
-
-	mbm_socket_max = topology_max_packages();
-	array_size = sizeof(struct sample) * maxid * mbm_socket_max;
-	mbm_local = kmalloc(array_size, GFP_KERNEL);
-	if (!mbm_local)
-		return -ENOMEM;
-
-	mbm_total = kmalloc(array_size, GFP_KERNEL);
-	if (!mbm_total) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	array_size = sizeof(struct hrtimer) * mbm_socket_max;
-	mbm_timers = kmalloc(array_size, GFP_KERNEL);
-	if (!mbm_timers) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	mbm_hrtimer_init();
-
-out:
-	if (ret)
-		mbm_cleanup();
-
-	return ret;
-}
-
-static int __init intel_cqm_init(void)
-{
-	char *str = NULL, scale[20];
-	int cpu, ret;
-
-	if (x86_match_cpu(intel_cqm_match))
-		cqm_enabled = true;
-
-	if (x86_match_cpu(intel_mbm_local_match) &&
-	     x86_match_cpu(intel_mbm_total_match))
-		mbm_enabled = true;
-
-	if (!cqm_enabled && !mbm_enabled)
-		return -ENODEV;
-
-	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
-
-	/*
-	 * It's possible that not all resources support the same number
-	 * of RMIDs. Instead of making scheduling much more complicated
-	 * (where we have to match a task's RMID to a cpu that supports
-	 * that many RMIDs) just find the minimum RMIDs supported across
-	 * all cpus.
-	 *
-	 * Also, check that the scales match on all cpus.
-	 */
-	cpus_read_lock();
-	for_each_online_cpu(cpu) {
-		struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-		if (c->x86_cache_max_rmid < cqm_max_rmid)
-			cqm_max_rmid = c->x86_cache_max_rmid;
-
-		if (c->x86_cache_occ_scale != cqm_l3_scale) {
-			pr_err("Multiple LLC scale values, disabling\n");
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	/*
-	 * A reasonable upper limit on the max threshold is the number
-	 * of lines tagged per RMID if all RMIDs have the same number of
-	 * lines tagged in the LLC.
-	 *
-	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
-	 */
-	__intel_cqm_max_threshold =
-		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
-
-	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
-	str = kstrdup(scale, GFP_KERNEL);
-	if (!str) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	event_attr_intel_cqm_llc_scale.event_str = str;
-
-	ret = intel_cqm_setup_rmid_cache();
-	if (ret)
-		goto out;
-
-	if (mbm_enabled)
-		ret = intel_mbm_init();
-	if (ret && !cqm_enabled)
-		goto out;
-
-	if (cqm_enabled && mbm_enabled)
-		intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
-	else if (!cqm_enabled && mbm_enabled)
-		intel_cqm_events_group.attrs = intel_mbm_events_attr;
-	else if (cqm_enabled && !mbm_enabled)
-		intel_cqm_events_group.attrs = intel_cqm_events_attr;
-
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-	if (ret) {
-		pr_err("Intel CQM perf registration failed: %d\n", ret);
-		goto out;
-	}
-
-	if (cqm_enabled)
-		pr_info("Intel CQM monitoring enabled\n");
-	if (mbm_enabled)
-		pr_info("Intel MBM enabled\n");
-
-	/*
-	 * Setup the hot cpu notifier once we are sure cqm
-	 * is enabled to avoid notifier leak.
-	 */
-	cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
-				     "perf/x86/cqm:starting",
-				     intel_cqm_cpu_starting, NULL);
-	cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
-				     "perf/x86/cqm:online",
-				     NULL, intel_cqm_cpu_exit);
-out:
-	cpus_read_unlock();
-
-	if (ret) {
-		kfree(str);
-		cqm_cleanup();
-		mbm_cleanup();
-	}
-
-	return ret;
-}
-device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 238ae3248ba5..4cf100ff2a37 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,16 +40,16 @@
  * Model specific counters:
  *	MSR_CORE_C1_RES: CORE C1 Residency Counter
  *			 perf code: 0x00
- *			 Available model: SLM,AMT
+ *			 Available model: SLM,AMT,GLM
  *			 Scope: Core (each processor core has a MSR)
  *	MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
  *			       perf code: 0x01
- *			       Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *			       Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM
  *			       Scope: Core
  *	MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
  *			       perf code: 0x02
  *			       Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
- *						SKL,KNL
+ *						SKL,KNL,GLM
  *			       Scope: Core
  *	MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *			       perf code: 0x03
@@ -57,16 +57,17 @@
  *			       Scope: Core
  *	MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
  *			       perf code: 0x00
- *			       Available model: SNB,IVB,HSW,BDW,SKL,KNL
+ *			       Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM
  *			       Scope: Package (physical package)
  *	MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
  *			       perf code: 0x01
  *			       Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL
+ *						GLM
  *			       Scope: Package (physical package)
  *	MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
  *			       perf code: 0x02
  *			       Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
- *						SKL,KNL
+ *						SKL,KNL,GLM
  *			       Scope: Package (physical package)
  *	MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
  *			       perf code: 0x03
@@ -82,7 +83,7 @@
  *			       Scope: Package (physical package)
  *	MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
  *			       perf code: 0x06
- *			       Available model: HSW ULT only
+ *			       Available model: HSW ULT, GLM
  *			       Scope: Package (physical package)
  *
  */
@@ -504,6 +505,17 @@ static const struct cstate_model knl_cstates __initconst = {
 };
 
 
+static const struct cstate_model glm_cstates __initconst = {
+	.core_events		= BIT(PERF_CSTATE_CORE_C1_RES) |
+				  BIT(PERF_CSTATE_CORE_C3_RES) |
+				  BIT(PERF_CSTATE_CORE_C6_RES),
+
+	.pkg_events		= BIT(PERF_CSTATE_PKG_C2_RES) |
+				  BIT(PERF_CSTATE_PKG_C3_RES) |
+				  BIT(PERF_CSTATE_PKG_C6_RES) |
+				  BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
 
 #define X86_CSTATES_MODEL(model, states)				\
 	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) }
@@ -546,6 +558,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 
 	X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
+
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c6d23ffe422d..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
  */
 #define P(a, b) PERF_MEM_S(a, b)
 #define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
 #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
 
 /* Version for Sandy Bridge and later */
 static u64 pebs_data_source[] = {
-	P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-	OP_LH | P(LVL, L1)  | P(SNOOP, NONE),	/* 0x01: L1 local */
-	OP_LH | P(LVL, LFB) | P(SNOOP, NONE),	/* 0x02: LFB hit */
-	OP_LH | P(LVL, L2)  | P(SNOOP, NONE),	/* 0x03: L2 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, NONE),	/* 0x04: L3 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, MISS),	/* 0x05: L3 hit, snoop miss */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HIT),	/* 0x06: L3 hit, snoop hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HITM),	/* 0x07: L3 hit, snoop hitm */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-	OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-	OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-	OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-	OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-	OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-	OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+	P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+	OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),  /* 0x01: L1 local */
+	OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),  /* 0x03: L2 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, NONE),  /* 0x04: L3 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, MISS),  /* 0x05: L3 hit, snoop miss */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HIT),   /* 0x06: L3 hit, snoop hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HITM),  /* 0x07: L3 hit, snoop hitm */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | P(SNOOP, HIT),       /* 0x0a: L3 miss, shared */
+	OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | SNOOP_NONE_MISS,     /* 0x0c: L3 miss, excl */
+	OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+	OP_LH | P(LVL, IO)  | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+	OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
 };
 
 /* Patch up minor differences in the bits */
 void __init intel_pmu_pebs_data_source_nhm(void)
 {
-	pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
-	pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
-	pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+	pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+	pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+	pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+	u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+	pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+	pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+	pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+	pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+	pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
 }
 
 static u64 precise_store_data(u64 status)
@@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status)
 {
 	union intel_x86_pebs_dse dse;
 	u64 val;
-	int model = boot_cpu_data.x86_model;
-	int fam = boot_cpu_data.x86;
 
 	dse.val = status;
 
@@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status)
 	/*
 	 * Nehalem models do not support TLB, Lock infos
 	 */
-	if (fam == 0x6 && (model == 26 || model == 30
-	    || model == 31 || model == 46)) {
+	if (x86_pmu.pebs_no_tlb) {
 		val |= P(TLB, NA) | P(LOCK, NA);
 		return val;
 	}
@@ -606,12 +616,6 @@ static inline void intel_pmu_drain_pebs_buffer(void)
 	x86_pmu.drain_pebs(&regs);
 }
 
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-	if (!sched_in)
-		intel_pmu_drain_pebs_buffer();
-}
-
 /*
  * PEBS
  */
@@ -651,6 +655,12 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_glp_pebs_event_constraints[] = {
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+	EVENT_CONSTRAINT_END
+};
+
 struct event_constraint intel_nehalem_pebs_event_constraints[] = {
 	INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
@@ -816,6 +826,14 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
 	return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
 }
 
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (!sched_in && pebs_needs_sched_cb(cpuc))
+		intel_pmu_drain_pebs_buffer();
+}
+
 static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
 {
 	struct debug_store *ds = cpuc->ds;
@@ -889,6 +907,8 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		ds->pebs_event_reset[hwc->idx] =
 			(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+	} else {
+		ds->pebs_event_reset[hwc->idx] = 0;
 	}
 }
 
@@ -1165,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	else
 		regs->flags &= ~PERF_EFLAGS_EXACT;
 
-	if ((sample_type & PERF_SAMPLE_ADDR) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index eb261656a320..8a6bbacd17dc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
 	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
 	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
+
+	X86_BR_TYPE_SAVE	= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -380,8 +383,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct x86_perf_task_context *task_ctx;
 
+	if (!cpuc->lbr_users)
+		return;
+
 	/*
 	 * If LBR callstack feature is enabled and the stack was saved when
 	 * the task was scheduled out, restore the stack. Otherwise flush
@@ -510,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[i].in_tx	= 0;
 		cpuc->lbr_entries[i].abort	= 0;
 		cpuc->lbr_entries[i].cycles	= 0;
+		cpuc->lbr_entries[i].type	= 0;
 		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
@@ -596,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[out].in_tx	 = in_tx;
 		cpuc->lbr_entries[out].abort	 = abort;
 		cpuc->lbr_entries[out].cycles	 = cycles;
+		cpuc->lbr_entries[out].type	 = 0;
 		cpuc->lbr_entries[out].reserved	 = 0;
 		out++;
 	}
@@ -673,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 
 	if (br_type & PERF_SAMPLE_BRANCH_CALL)
 		mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+		mask |= X86_BR_TYPE_SAVE;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -926,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 	return ret;
 }
 
+#define X86_BR_TYPE_MAP_MAX	16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+	PERF_BR_CALL,		/* X86_BR_CALL */
+	PERF_BR_RET,		/* X86_BR_RET */
+	PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
+	PERF_BR_SYSRET,		/* X86_BR_SYSRET */
+	PERF_BR_UNKNOWN,	/* X86_BR_INT */
+	PERF_BR_UNKNOWN,	/* X86_BR_IRET */
+	PERF_BR_COND,		/* X86_BR_JCC */
+	PERF_BR_UNCOND,		/* X86_BR_JMP */
+	PERF_BR_UNKNOWN,	/* X86_BR_IRQ */
+	PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
+	PERF_BR_UNKNOWN,	/* X86_BR_ABORT */
+	PERF_BR_UNKNOWN,	/* X86_BR_IN_TX */
+	PERF_BR_UNKNOWN,	/* X86_BR_NO_TX */
+	PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
+	PERF_BR_UNKNOWN,	/* X86_BR_CALL_STACK */
+	PERF_BR_IND,		/* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+	int i;
+
+	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+	if (type) {
+		i = __ffs(type);
+		if (i < X86_BR_TYPE_MAP_MAX)
+			return branch_map[i];
+	}
+
+	return PERF_BR_UNKNOWN;
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -942,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	bool compress = false;
 
 	/* if sampling all branches, then nothing to filter */
-	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
 	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -963,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			cpuc->lbr_entries[i].from = 0;
 			compress = true;
 		}
+
+		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+			cpuc->lbr_entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index eb0533558c2b..d32c0eed38ca 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids
  * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
  * either up to date automatically or not applicable at all.
  */
-struct p4_event_alias {
+static struct p4_event_alias {
 	u64 original;
 	u64 alternative;
 } p4_event_aliases[] = {
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index ae8324d65e61..81fd41d5a0d9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 reg;
 
-	if (!event->hw.itrace_started) {
-		event->hw.itrace_started = 1;
+	/* First round: clear STATUS, in particular the PSB byte counter. */
+	if (!event->hw.config) {
+		perf_event_itrace_started(event);
 		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 	}
 
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index a45e2114a846..8e2457cb6b4a 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = {
 	.attrs = rapl_formats_attr,
 };
 
-const struct attribute_group *rapl_attr_groups[] = {
+static const struct attribute_group *rapl_attr_groups[] = {
 	&rapl_pmu_attr_group,
 	&rapl_pmu_format_group,
 	&rapl_pmu_events_group,
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 44ec523287f6..1c5390f1cf09 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group uncore_pmu_attr_group = {
+static const struct attribute_group uncore_pmu_attr_group = {
 	.attrs = uncore_pmu_attrs,
 };
 
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index cda569332005..6a5cbe90f859 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -272,7 +272,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_ubox_format_group = {
+static const struct attribute_group nhmex_uncore_ubox_format_group = {
 	.name		= "format",
 	.attrs		= nhmex_uncore_ubox_formats_attr,
 };
@@ -299,7 +299,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_cbox_format_group = {
+static const struct attribute_group nhmex_uncore_cbox_format_group = {
 	.name = "format",
 	.attrs = nhmex_uncore_cbox_formats_attr,
 };
@@ -407,7 +407,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_bbox_format_group = {
+static const struct attribute_group nhmex_uncore_bbox_format_group = {
 	.name = "format",
 	.attrs = nhmex_uncore_bbox_formats_attr,
 };
@@ -484,7 +484,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_sbox_format_group = {
+static const struct attribute_group nhmex_uncore_sbox_format_group = {
 	.name			= "format",
 	.attrs			= nhmex_uncore_sbox_formats_attr,
 };
@@ -898,7 +898,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_mbox_format_group = {
+static const struct attribute_group nhmex_uncore_mbox_format_group = {
 	.name		= "format",
 	.attrs		= nhmex_uncore_mbox_formats_attr,
 };
@@ -1163,7 +1163,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhmex_uncore_rbox_format_group = {
+static const struct attribute_group nhmex_uncore_rbox_format_group = {
 	.name = "format",
 	.attrs = nhmex_uncore_rbox_formats_attr,
 };
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a3dcc12bef4a..db1127ce685e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -130,7 +130,7 @@ static struct attribute *snb_uncore_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group snb_uncore_format_group = {
+static const struct attribute_group snb_uncore_format_group = {
 	.name		= "format",
 	.attrs		= snb_uncore_formats_attr,
 };
@@ -289,7 +289,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group snb_uncore_imc_format_group = {
+static const struct attribute_group snb_uncore_imc_format_group = {
 	.name = "format",
 	.attrs = snb_uncore_imc_formats_attr,
 };
@@ -769,7 +769,7 @@ static struct attribute *nhm_uncore_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group nhm_uncore_format_group = {
+static const struct attribute_group nhm_uncore_format_group = {
 	.name = "format",
 	.attrs = nhm_uncore_formats_attr,
 };
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index dae2fedc1601..db1fe377e6dd 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -316,7 +316,7 @@
 #define SKX_UPI_PCI_PMON_CTL0		0x350
 #define SKX_UPI_PCI_PMON_CTR0		0x318
 #define SKX_UPI_PCI_PMON_BOX_CTL	0x378
-#define SKX_PMON_CTL_UMASK_EXT		0xff
+#define SKX_UPI_CTL_UMASK_EXT		0xffefff
 
 /* SKX M2M */
 #define SKX_M2M_PCI_PMON_CTL0		0x228
@@ -328,7 +328,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
 DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
 DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
 DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
@@ -351,7 +351,6 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
@@ -603,27 +602,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = {
 	{ /* end: all zeroes */ },
 };
 
-static struct attribute_group snbep_uncore_format_group = {
+static const struct attribute_group snbep_uncore_format_group = {
 	.name = "format",
 	.attrs = snbep_uncore_formats_attr,
 };
 
-static struct attribute_group snbep_uncore_ubox_format_group = {
+static const struct attribute_group snbep_uncore_ubox_format_group = {
 	.name = "format",
 	.attrs = snbep_uncore_ubox_formats_attr,
 };
 
-static struct attribute_group snbep_uncore_cbox_format_group = {
+static const struct attribute_group snbep_uncore_cbox_format_group = {
 	.name = "format",
 	.attrs = snbep_uncore_cbox_formats_attr,
 };
 
-static struct attribute_group snbep_uncore_pcu_format_group = {
+static const struct attribute_group snbep_uncore_pcu_format_group = {
 	.name = "format",
 	.attrs = snbep_uncore_pcu_formats_attr,
 };
 
-static struct attribute_group snbep_uncore_qpi_format_group = {
+static const struct attribute_group snbep_uncore_qpi_format_group = {
 	.name = "format",
 	.attrs = snbep_uncore_qpi_formats_attr,
 };
@@ -1432,27 +1431,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group ivbep_uncore_format_group = {
+static const struct attribute_group ivbep_uncore_format_group = {
 	.name = "format",
 	.attrs = ivbep_uncore_formats_attr,
 };
 
-static struct attribute_group ivbep_uncore_ubox_format_group = {
+static const struct attribute_group ivbep_uncore_ubox_format_group = {
 	.name = "format",
 	.attrs = ivbep_uncore_ubox_formats_attr,
 };
 
-static struct attribute_group ivbep_uncore_cbox_format_group = {
+static const struct attribute_group ivbep_uncore_cbox_format_group = {
 	.name = "format",
 	.attrs = ivbep_uncore_cbox_formats_attr,
 };
 
-static struct attribute_group ivbep_uncore_pcu_format_group = {
+static const struct attribute_group ivbep_uncore_pcu_format_group = {
 	.name = "format",
 	.attrs = ivbep_uncore_pcu_formats_attr,
 };
 
-static struct attribute_group ivbep_uncore_qpi_format_group = {
+static const struct attribute_group ivbep_uncore_qpi_format_group = {
 	.name = "format",
 	.attrs = ivbep_uncore_qpi_formats_attr,
 };
@@ -1888,7 +1887,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group knl_uncore_ubox_format_group = {
+static const struct attribute_group knl_uncore_ubox_format_group = {
 	.name = "format",
 	.attrs = knl_uncore_ubox_formats_attr,
 };
@@ -1928,7 +1927,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group knl_uncore_cha_format_group = {
+static const struct attribute_group knl_uncore_cha_format_group = {
 	.name = "format",
 	.attrs = knl_uncore_cha_formats_attr,
 };
@@ -2038,7 +2037,7 @@ static struct attribute *knl_uncore_pcu_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group knl_uncore_pcu_format_group = {
+static const struct attribute_group knl_uncore_pcu_format_group = {
 	.name = "format",
 	.attrs = knl_uncore_pcu_formats_attr,
 };
@@ -2188,7 +2187,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group knl_uncore_irp_format_group = {
+static const struct attribute_group knl_uncore_irp_format_group = {
 	.name = "format",
 	.attrs = knl_uncore_irp_formats_attr,
 };
@@ -2386,7 +2385,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group hswep_uncore_ubox_format_group = {
+static const struct attribute_group hswep_uncore_ubox_format_group = {
 	.name = "format",
 	.attrs = hswep_uncore_ubox_formats_attr,
 };
@@ -2440,7 +2439,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group hswep_uncore_cbox_format_group = {
+static const struct attribute_group hswep_uncore_cbox_format_group = {
 	.name = "format",
 	.attrs = hswep_uncore_cbox_formats_attr,
 };
@@ -2622,7 +2621,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group hswep_uncore_sbox_format_group = {
+static const struct attribute_group hswep_uncore_sbox_format_group = {
 	.name = "format",
 	.attrs = hswep_uncore_sbox_formats_attr,
 };
@@ -3302,7 +3301,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = {
 	&format_attr_inv.attr,
 	&format_attr_thresh8.attr,
 	&format_attr_filter_tid4.attr,
-	&format_attr_filter_link4.attr,
 	&format_attr_filter_state5.attr,
 	&format_attr_filter_rem.attr,
 	&format_attr_filter_loc.attr,
@@ -3312,12 +3310,11 @@ static struct attribute *skx_uncore_cha_formats_attr[] = {
 	&format_attr_filter_opc_0.attr,
 	&format_attr_filter_opc_1.attr,
 	&format_attr_filter_nc.attr,
-	&format_attr_filter_c6.attr,
 	&format_attr_filter_isoc.attr,
 	NULL,
 };
 
-static struct attribute_group skx_uncore_chabox_format_group = {
+static const struct attribute_group skx_uncore_chabox_format_group = {
 	.name = "format",
 	.attrs = skx_uncore_cha_formats_attr,
 };
@@ -3333,8 +3330,11 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = {
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8),
+	EVENT_EXTRA_END
 };
 
 static u64 skx_cha_filter_mask(int fields)
@@ -3347,6 +3347,17 @@ static u64 skx_cha_filter_mask(int fields)
 		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
 	if (fields & 0x4)
 		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+	if (fields & 0x8) {
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC;
+		mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC;
+	}
 	return mask;
 }
 
@@ -3416,7 +3427,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group skx_uncore_iio_format_group = {
+static const struct attribute_group skx_uncore_iio_format_group = {
 	.name = "format",
 	.attrs = skx_uncore_iio_formats_attr,
 };
@@ -3473,7 +3484,7 @@ static struct attribute *skx_uncore_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group skx_uncore_format_group = {
+static const struct attribute_group skx_uncore_format_group = {
 	.name = "format",
 	.attrs = skx_uncore_formats_attr,
 };
@@ -3492,6 +3503,26 @@ static struct intel_uncore_type skx_uncore_irp = {
 	.format_group		= &skx_uncore_format_group,
 };
 
+static struct attribute *skx_uncore_pcu_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_occ_invert.attr,
+	&format_attr_occ_edge_det.attr,
+	&format_attr_filter_band0.attr,
+	&format_attr_filter_band1.attr,
+	&format_attr_filter_band2.attr,
+	&format_attr_filter_band3.attr,
+	NULL,
+};
+
+static struct attribute_group skx_uncore_pcu_format_group = {
+	.name = "format",
+	.attrs = skx_uncore_pcu_formats_attr,
+};
+
 static struct intel_uncore_ops skx_uncore_pcu_ops = {
 	IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
 	.hw_config		= hswep_pcu_hw_config,
@@ -3510,7 +3541,7 @@ static struct intel_uncore_type skx_uncore_pcu = {
 	.box_ctl		= HSWEP_PCU_MSR_PMON_BOX_CTL,
 	.num_shared_regs	= 1,
 	.ops			= &skx_uncore_pcu_ops,
-	.format_group		= &snbep_uncore_pcu_format_group,
+	.format_group		= &skx_uncore_pcu_format_group,
 };
 
 static struct intel_uncore_type *skx_msr_uncores[] = {
@@ -3574,7 +3605,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = {
 	NULL,
 };
 
-static struct attribute_group skx_upi_uncore_format_group = {
+static const struct attribute_group skx_upi_uncore_format_group = {
 	.name = "format",
 	.attrs = skx_upi_uncore_formats_attr,
 };
@@ -3603,8 +3634,8 @@ static struct intel_uncore_type skx_uncore_upi = {
 	.perf_ctr_bits	= 48,
 	.perf_ctr	= SKX_UPI_PCI_PMON_CTR0,
 	.event_ctl	= SKX_UPI_PCI_PMON_CTL0,
-	.event_mask	= SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-	.event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,
+	.event_mask_ext = SKX_UPI_CTL_UMASK_EXT,
 	.box_ctl	= SKX_UPI_PCI_PMON_BOX_CTL,
 	.ops		= &skx_upi_uncore_pci_ops,
 	.format_group	= &skx_upi_uncore_format_group,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53728eea1bed..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
@@ -558,6 +558,7 @@ struct x86_pmu {
 	int		attr_rdpmc;
 	struct attribute **format_attrs;
 	struct attribute **event_attrs;
+	struct attribute **caps_attrs;
 
 	ssize_t		(*events_sysfs_show)(char *page, u64 config);
 	struct attribute **cpu_events;
@@ -591,7 +592,8 @@ struct x86_pmu {
 			pebs		:1,
 			pebs_active	:1,
 			pebs_broken	:1,
-			pebs_prec_dist	:1;
+			pebs_prec_dist	:1,
+			pebs_no_tlb	:1;
 	int		pebs_record_size;
 	int		pebs_buffer_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
@@ -741,6 +743,8 @@ int x86_reserve_hardware(void);
 
 void x86_release_hardware(void);
 
+int x86_pmu_max_precise(void);
+
 void hw_perf_lbr_event_destroy(struct perf_event *event);
 
 int x86_setup_perfctr(struct perf_event *event);
@@ -879,6 +883,8 @@ extern struct event_constraint intel_slm_pebs_event_constraints[];
 
 extern struct event_constraint intel_glm_pebs_event_constraints[];
 
+extern struct event_constraint intel_glp_pebs_event_constraints[];
+
 extern struct event_constraint intel_nehalem_pebs_event_constraints[];
 
 extern struct event_constraint intel_westmere_pebs_event_constraints[];
@@ -945,6 +951,8 @@ void intel_pmu_lbr_init_knl(void);
 
 void intel_pmu_pebs_data_source_nhm(void);
 
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 724153797209..e0bb46c02857 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 	if (ksig->ka.sa.sa_flags & SA_ONSTACK)
 		sp = sigsp(sp, ksig);
 	/* This is the legacy signal stack switching. */
-	else if ((regs->ss & 0xffff) != __USER32_DS &&
+	else if (regs->ss != __USER32_DS &&
 		!(ksig->ka.sa.sa_flags & SA_RESTORER) &&
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2efc768e4362..72d867f6b518 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
-#define acpi_unlazy_tlb(x)	leave_mm(x)
-
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
@@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 	 * you call efi_mem_attributes() during boot and at runtime,
 	 * you could theoretically see different attributes.
 	 *
-	 * Since we are yet to see any x86 platforms that require
-	 * anything other than PAGE_KERNEL (some arm64 platforms
-	 * require the equivalent of PAGE_KERNEL_NOCACHE), return that
-	 * until we know differently.
+	 * We are yet to see any x86 platforms that require anything
+	 * other than PAGE_KERNEL (some ARM64 platforms require the
+	 * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+	 * is active, the ACPI information will not be encrypted,
+	 * so return PAGE_KERNEL_NOENC until we know differently.
 	 */
-	 return PAGE_KERNEL;
+	return PAGE_KERNEL_NOENC;
 }
 #endif
 
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
 # define _ASM_EXTABLE_EX(from, to)				\
 	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
+# define _ASM_EXTABLE_REFCOUNT(from, to)			\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
 	_ASM_ALIGN ;						\
@@ -123,6 +126,9 @@
 # define _ASM_EXTABLE_EX(from, to)				\
 	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
+# define _ASM_EXTABLE_REFCOUNT(from, to)			\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	return xchg(&v->counter, new);
 }
 
-#define ATOMIC_OP(op)							\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	asm volatile(LOCK_PREFIX #op"l %1,%0"				\
-			: "+m" (v->counter)				\
-			: "ir" (i)					\
-			: "memory");					\
+static inline void atomic_and(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "andl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
+}
+
+static inline int atomic_fetch_and(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
+
+	do { } while (!atomic_try_cmpxchg(v, &val, val & i));
+
+	return val;
 }
 
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	int val = atomic_read(v);					\
-	do {								\
-	} while (!atomic_try_cmpxchg(v, &val, val c_op i));		\
-	return val;							\
+static inline void atomic_or(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "orl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
 }
 
-#define ATOMIC_OPS(op, c_op)						\
-	ATOMIC_OP(op)							\
-	ATOMIC_FETCH_OP(op, c_op)
+static inline int atomic_fetch_or(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
 
-ATOMIC_OPS(and, &)
-ATOMIC_OPS(or , |)
-ATOMIC_OPS(xor, ^)
+	do { } while (!atomic_try_cmpxchg(v, &val, val | i));
 
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
+	return val;
+}
+
+static inline void atomic_xor(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "xorl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
+}
+
+static inline int atomic_fetch_xor(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
+
+	do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
+
+	return val;
+}
 
 /**
  * __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c = atomic_read(v);
+
 	do {
 		if (unlikely(c == u))
 			break;
 	} while (!atomic_try_cmpxchg(v, &c, c + a));
+
 	return c;
 }
 
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 #undef alternative_atomic64
 #undef __alternative_atomic64
 
-#define ATOMIC64_OP(op, c_op)						\
-static inline void atomic64_##op(long long i, atomic64_t *v)		\
-{									\
-	long long old, c = 0;						\
-	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\
-		c = old;						\
+static inline void atomic64_and(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+		c = old;
 }
 
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static inline long long atomic64_fetch_##op(long long i, atomic64_t *v)	\
-{									\
-	long long old, c = 0;						\
-	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\
-		c = old;						\
-	return old;							\
+static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+		c = old;
+
+	return old;
 }
 
-ATOMIC64_FETCH_OP(add, +)
+static inline void atomic64_or(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
 
-#define atomic64_fetch_sub(i, v)	atomic64_fetch_add(-(i), (v))
+	while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+		c = old;
+}
+
+static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+		c = old;
+
+	return old;
+}
 
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op, c_op)						\
-	ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_xor(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+		c = old;
+}
 
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+		c = old;
+
+	return old;
+}
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
+		c = old;
+
+	return old;
+}
+
+#define atomic64_fetch_sub(i, v)	atomic64_fetch_add(-(i), (v))
 
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
 }
 
 #define atomic64_try_cmpxchg atomic64_try_cmpxchg
-static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
 {
 	return try_cmpxchg(&v->counter, old, new);
 }
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
  */
 static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
 {
-	long c = atomic64_read(v);
+	s64 c = atomic64_read(v);
 	do {
 		if (unlikely(c == u))
 			return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
  */
 static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
-	long dec, c = atomic64_read(v);
+	s64 dec, c = atomic64_read(v);
 	do {
 		dec = c - 1;
 		if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-#define ATOMIC64_OP(op)							\
-static inline void atomic64_##op(long i, atomic64_t *v)			\
-{									\
-	asm volatile(LOCK_PREFIX #op"q %1,%0"				\
-			: "+m" (v->counter)				\
-			: "er" (i)					\
-			: "memory");					\
+static inline void atomic64_and(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "andq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
 }
 
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static inline long atomic64_fetch_##op(long i, atomic64_t *v)		\
-{									\
-	long val = atomic64_read(v);					\
-	do {								\
-	} while (!atomic64_try_cmpxchg(v, &val, val c_op i));		\
-	return val;							\
+static inline long atomic64_fetch_and(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
+
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val & i));
+	return val;
 }
 
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op)							\
-	ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_or(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "orq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
+}
 
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long atomic64_fetch_or(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val | i));
+	return val;
+}
+
+static inline void atomic64_xor(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "xorq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
+}
+
+static inline long atomic64_fetch_xor(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
+
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val ^ i));
+	return val;
+}
 
 #endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 39e702d90cdb..aa6b2023d8f8 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -35,7 +35,7 @@
 #define _BUG_FLAGS(ins, flags)						\
 do {									\
 	asm volatile("1:\t" ins "\n"					\
-		     ".pushsection __bug_table,\"a\"\n"			\
+		     ".pushsection __bug_table,\"aw\"\n"		\
 		     "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"	\
 		     "\t"  __BUG_REL(%c0) "\t# bug_entry::file\n"	\
 		     "\t.word %c1"        "\t# bug_entry::line\n"	\
@@ -52,7 +52,7 @@ do {									\
 #define _BUG_FLAGS(ins, flags)						\
 do {									\
 	asm volatile("1:\t" ins "\n"					\
-		     ".pushsection __bug_table,\"a\"\n"			\
+		     ".pushsection __bug_table,\"aw\"\n"		\
 		     "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"	\
 		     "\t.word %c0"        "\t# bug_entry::flags\n"	\
 		     "\t.org 2b+%c1\n"					\
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+			char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
 #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)		\
 ({									\
 	bool success;							\
-	__typeof__(_ptr) _old = (_pold);				\
+	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
 	__typeof__(*(_ptr)) __old = *_old;				\
 	__typeof__(*(_ptr)) __new = (_new);				\
 	switch (size) {							\
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ca3c48c0872f..42bbbf0f173d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
 #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PTSC	( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC	( 6*32+28) /* Last Level Cache performance counter extensions */
 #define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
 
 /*
@@ -196,6 +196,7 @@
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME		( 7*32+10) /* AMD Secure Memory Encryption */
 
 #define X86_FEATURE_INTEL_PPIN	( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
@@ -286,7 +287,7 @@
 #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
 #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
 #define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
 #define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..1a2ba368da39 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -5,6 +5,7 @@
 #include <asm/ldt.h>
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
+#include <asm/irq_vectors.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 	desc->s			= 1;
 	desc->dpl		= 0x3;
 	desc->p			= info->seg_not_present ^ 1;
-	desc->limit		= (info->limit & 0xf0000) >> 16;
+	desc->limit1		= (info->limit & 0xf0000) >> 16;
 	desc->avl		= info->useable;
 	desc->d			= info->seg_32bit;
 	desc->g			= info->limit_in_pages;
@@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
 	return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
 }
 
-#ifdef CONFIG_X86_64
-
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
-	gate->offset_low	= PTR_LOW(func);
+	gate->offset_low	= (u16) func;
+	gate->bits.p		= 1;
+	gate->bits.dpl		= dpl;
+	gate->bits.zero		= 0;
+	gate->bits.type		= type;
+	gate->offset_middle	= (u16) (func >> 16);
+#ifdef CONFIG_X86_64
 	gate->segment		= __KERNEL_CS;
-	gate->ist		= ist;
-	gate->p			= 1;
-	gate->dpl		= dpl;
-	gate->zero0		= 0;
-	gate->zero1		= 0;
-	gate->type		= type;
-	gate->offset_middle	= PTR_MIDDLE(func);
-	gate->offset_high	= PTR_HIGH(func);
-}
-
+	gate->bits.ist		= ist;
+	gate->reserved		= 0;
+	gate->offset_high	= (u32) (func >> 32);
 #else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
-			     unsigned long base, unsigned dpl, unsigned flags,
-			     unsigned short seg)
-{
-	gate->a = (seg << 16) | (base & 0xffff);
-	gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
-}
-
+	gate->segment		= seg;
+	gate->bits.ist		= 0;
 #endif
+}
 
 static inline int desc_empty(const void *ptr)
 {
@@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int
 	memcpy(&gdt[entry], desc, size);
 }
 
-static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
-				   unsigned long limit, unsigned char type,
-				   unsigned char flags)
-{
-	desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
-	desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
-		(limit & 0x000f0000) | ((type & 0xff) << 8) |
-		((flags & 0xf) << 20);
-	desc->p = 1;
-}
-
-
-static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+					 unsigned type, unsigned size)
 {
-#ifdef CONFIG_X86_64
-	struct ldttss_desc64 *desc = d;
+	struct ldttss_desc *desc = d;
 
 	memset(desc, 0, sizeof(*desc));
 
-	desc->limit0		= size & 0xFFFF;
-	desc->base0		= PTR_LOW(addr);
-	desc->base1		= PTR_MIDDLE(addr) & 0xFF;
+	desc->limit0		= (u16) size;
+	desc->base0		= (u16) addr;
+	desc->base1		= (addr >> 16) & 0xFF;
 	desc->type		= type;
 	desc->p			= 1;
 	desc->limit1		= (size >> 16) & 0xF;
-	desc->base2		= (PTR_MIDDLE(addr) >> 8) & 0xFF;
-	desc->base3		= PTR_HIGH(addr);
-#else
-	pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+	desc->base2		= (addr >> 24) & 0xFF;
+#ifdef CONFIG_X86_64
+	desc->base3		= (u32) (addr >> 32);
 #endif
 }
 
@@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
 
 static inline unsigned long get_desc_limit(const struct desc_struct *desc)
 {
-	return desc->limit0 | (desc->limit << 16);
+	return desc->limit0 | (desc->limit1 << 16);
 }
 
 static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 {
 	desc->limit0 = limit & 0xffff;
-	desc->limit = (limit >> 16) & 0xf;
-}
-
-#ifdef CONFIG_X86_64
-static inline void set_nmi_gate(int gate, void *addr)
-{
-	gate_desc s;
-
-	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
-	write_idt_entry(debug_idt_table, gate, &s);
+	desc->limit1 = (limit >> 16) & 0xf;
 }
-#endif
 
-#ifdef CONFIG_TRACING
-extern struct desc_ptr trace_idt_descr;
-extern gate_desc trace_idt_table[];
-static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
-{
-	write_idt_entry(trace_idt_table, entry, gate);
-}
+void update_intr_gate(unsigned int n, const void *addr);
+void alloc_intr_gate(unsigned int n, const void *addr);
 
-static inline void _trace_set_gate(int gate, unsigned type, void *addr,
-				   unsigned dpl, unsigned ist, unsigned seg)
-{
-	gate_desc s;
-
-	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
-	/*
-	 * does not need to be atomic because it is only done once at
-	 * setup time
-	 */
-	write_trace_idt_entry(gate, &s);
-}
-#else
-static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
-{
-}
-
-#define _trace_set_gate(gate, type, addr, dpl, ist, seg)
-#endif
-
-static inline void _set_gate(int gate, unsigned type, void *addr,
-			     unsigned dpl, unsigned ist, unsigned seg)
-{
-	gate_desc s;
-
-	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
-	/*
-	 * does not need to be atomic because it is only done once at
-	 * setup time
-	 */
-	write_idt_entry(idt_table, gate, &s);
-	write_trace_idt_entry(gate, &s);
-}
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-#define set_intr_gate_notrace(n, addr)					\
-	do {								\
-		BUG_ON((unsigned)n > 0xFF);				\
-		_set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0,	\
-			  __KERNEL_CS);					\
-	} while (0)
-
-#define set_intr_gate(n, addr)						\
-	do {								\
-		set_intr_gate_notrace(n, addr);				\
-		_trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
-				0, 0, __KERNEL_CS);			\
-	} while (0)
-
-extern int first_system_vector;
-/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
 extern unsigned long used_vectors[];
 
-static inline void alloc_system_vector(int vector)
-{
-	if (!test_bit(vector, used_vectors)) {
-		set_bit(vector, used_vectors);
-		if (first_system_vector > vector)
-			first_system_vector = vector;
-	} else {
-		BUG();
-	}
-}
-
-#define alloc_intr_gate(n, addr)				\
-	do {							\
-		alloc_system_vector(n);				\
-		set_intr_gate(n, addr);				\
-	} while (0)
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_system_trap_gate(unsigned int n, void *addr)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_trap_gate(unsigned int n, void *addr)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
-}
-
-static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
-}
-
-static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
-}
-
-static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
-{
-	BUG_ON((unsigned)n > 0xFF);
-	_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
-}
-
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(u32, debug_idt_ctr);
 static inline bool is_debug_idt_enabled(void)
@@ -567,31 +420,6 @@ static inline void load_debug_idt(void)
 }
 #endif
 
-#ifdef CONFIG_TRACING
-extern atomic_t trace_idt_ctr;
-static inline bool is_trace_idt_enabled(void)
-{
-	if (atomic_read(&trace_idt_ctr))
-		return true;
-
-	return false;
-}
-
-static inline void load_trace_idt(void)
-{
-	load_idt((const struct desc_ptr *)&trace_idt_descr);
-}
-#else
-static inline bool is_trace_idt_enabled(void)
-{
-	return false;
-}
-
-static inline void load_trace_idt(void)
-{
-}
-#endif
-
 /*
  * The load_current_idt() must be called with interrupts disabled
  * to avoid races. That way the IDT will always be set back to the expected
@@ -603,9 +431,25 @@ static inline void load_current_idt(void)
 {
 	if (is_debug_idt_enabled())
 		load_debug_idt();
-	else if (is_trace_idt_enabled())
-		load_trace_idt();
 	else
 		load_idt((const struct desc_ptr *)&idt_descr);
 }
+
+extern void idt_setup_early_handler(void);
+extern void idt_setup_early_traps(void);
+extern void idt_setup_traps(void);
+extern void idt_setup_apic_and_irq_gates(void);
+
+#ifdef CONFIG_X86_64
+extern void idt_setup_early_pf(void);
+extern void idt_setup_ist_traps(void);
+extern void idt_setup_debugidt_traps(void);
+#else
+static inline void idt_setup_early_pf(void) { }
+static inline void idt_setup_ist_traps(void) { }
+static inline void idt_setup_debugidt_traps(void) { }
+#endif
+
+extern void idt_invalidate(void *addr);
+
 #endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 49265345d4d2..346d252029b7 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -11,34 +11,30 @@
 
 #include <linux/types.h>
 
-/*
- * FIXME: Accessing the desc_struct through its fields is more elegant,
- * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b accessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than a union,
- * so we can get rid of it transparently in the future -- glommer
- */
 /* 8 byte segment descriptor */
 struct desc_struct {
-	union {
-		struct {
-			unsigned int a;
-			unsigned int b;
-		};
-		struct {
-			u16 limit0;
-			u16 base0;
-			unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
-			unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
-		};
-	};
+	u16	limit0;
+	u16	base0;
+	u16	base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+	u16	limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
 } __attribute__((packed));
 
-#define GDT_ENTRY_INIT(flags, base, limit) { { { \
-		.a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \
-		.b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \
-			((limit) & 0xf0000) | ((base) & 0xff000000), \
-	} } }
+#define GDT_ENTRY_INIT(flags, base, limit)			\
+	{							\
+		.limit0		= (u16) (limit),		\
+		.limit1		= ((limit) >> 16) & 0x0F,	\
+		.base0		= (u16) (base),			\
+		.base1		= ((base) >> 16) & 0xFF,	\
+		.base2		= ((base) >> 24) & 0xFF,	\
+		.type		= (flags & 0x0f),		\
+		.s		= (flags >> 4) & 0x01,		\
+		.dpl		= (flags >> 5) & 0x03,		\
+		.p		= (flags >> 7) & 0x01,		\
+		.avl		= (flags >> 12) & 0x01,		\
+		.l		= (flags >> 13) & 0x01,		\
+		.d		= (flags >> 14) & 0x01,		\
+		.g		= (flags >> 15) & 0x01,		\
+	}
 
 enum {
 	GATE_INTERRUPT = 0xE,
@@ -47,49 +43,63 @@ enum {
 	GATE_TASK = 0x5,
 };
 
-/* 16byte gate */
-struct gate_struct64 {
-	u16 offset_low;
-	u16 segment;
-	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
-	u16 offset_middle;
-	u32 offset_high;
-	u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
-
 enum {
 	DESC_TSS = 0x9,
 	DESC_LDT = 0x2,
 	DESCTYPE_S = 0x10,	/* !system */
 };
 
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct ldttss_desc64 {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-	u32 base3;
-	u32 zero1;
+/* LDT or TSS descriptor in the GDT. */
+struct ldttss_desc {
+	u16	limit0;
+	u16	base0;
+
+	u16	base1 : 8, type : 5, dpl : 2, p : 1;
+	u16	limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+#ifdef CONFIG_X86_64
+	u32	base3;
+	u32	zero1;
+#endif
 } __attribute__((packed));
 
+typedef struct ldttss_desc ldt_desc;
+typedef struct ldttss_desc tss_desc;
+
+struct idt_bits {
+	u16		ist	: 3,
+			zero	: 5,
+			type	: 5,
+			dpl	: 2,
+			p	: 1;
+} __attribute__((packed));
+
+struct gate_struct {
+	u16		offset_low;
+	u16		segment;
+	struct idt_bits	bits;
+	u16		offset_middle;
+#ifdef CONFIG_X86_64
+	u32		offset_high;
+	u32		reserved;
+#endif
+} __attribute__((packed));
+
+typedef struct gate_struct gate_desc;
+
+static inline unsigned long gate_offset(const gate_desc *g)
+{
 #ifdef CONFIG_X86_64
-typedef struct gate_struct64 gate_desc;
-typedef struct ldttss_desc64 ldt_desc;
-typedef struct ldttss_desc64 tss_desc;
-#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
-#define gate_segment(g) ((g).segment)
+	return g->offset_low | ((unsigned long)g->offset_middle << 16) |
+		((unsigned long) g->offset_high << 32);
 #else
-typedef struct desc_struct gate_desc;
-typedef struct desc_struct ldt_desc;
-typedef struct desc_struct tss_desc;
-#define gate_offset(g)		(((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
-#define gate_segment(g)		((g).a >> 16)
+	return g->offset_low | ((unsigned long)g->offset_middle << 16);
 #endif
+}
+
+static inline unsigned long gate_segment(const gate_desc *g)
+{
+	return g->segment;
+}
 
 struct desc_ptr {
 	unsigned short size;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5dff775af7cd..c10c9128f54e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -21,11 +21,13 @@
 # define DISABLE_K6_MTRR	(1<<(X86_FEATURE_K6_MTRR & 31))
 # define DISABLE_CYRIX_ARR	(1<<(X86_FEATURE_CYRIX_ARR & 31))
 # define DISABLE_CENTAUR_MCR	(1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID		0
 #else
 # define DISABLE_VME		0
 # define DISABLE_K6_MTRR	0
 # define DISABLE_CYRIX_ARR	0
 # define DISABLE_CENTAUR_MCR	0
+# define DISABLE_PCID		(1<<(X86_FEATURE_PCID & 31))
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
 #define DISABLED_MASK1	0
 #define DISABLED_MASK2	0
 #define DISABLED_MASK3	(DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4	0
+#define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
 #define DISABLED_MASK7	0
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 398c79889f5c..1387dafdba2d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <linux/dma-contiguous.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	return paddr;
+	return __sme_set(paddr);
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	return daddr;
+	return __sme_clr(daddr);
 }
 #endif /* CONFIG_X86_DMA_REMAP */
 
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 3c69fed215c5..a8e15b04565b 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap		early_ioremap
-#define dmi_early_unmap		early_iounmap
-#define dmi_remap		ioremap_cache
-#define dmi_unmap		iounmap
+#define dmi_early_remap		early_memremap
+#define dmi_early_unmap		early_memunmap
+#define dmi_remap(_x, _l)	memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x)		memunmap(_x)
 
 #endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index a504adc661a4..cd266d830e49 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
 extern void e820__reallocate_tables(void);
 extern void e820__register_nosave_regions(unsigned long limit_pfn);
 
+extern int  e820__get_entry_type(u64 start, u64 end);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..04330c8d9af9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -126,15 +126,15 @@ do {						\
 	pr_reg[4] = regs->di;			\
 	pr_reg[5] = regs->bp;			\
 	pr_reg[6] = regs->ax;			\
-	pr_reg[7] = regs->ds & 0xffff;		\
-	pr_reg[8] = regs->es & 0xffff;		\
-	pr_reg[9] = regs->fs & 0xffff;		\
+	pr_reg[7] = regs->ds;			\
+	pr_reg[8] = regs->es;			\
+	pr_reg[9] = regs->fs;			\
 	pr_reg[11] = regs->orig_ax;		\
 	pr_reg[12] = regs->ip;			\
-	pr_reg[13] = regs->cs & 0xffff;		\
+	pr_reg[13] = regs->cs;			\
 	pr_reg[14] = regs->flags;		\
 	pr_reg[15] = regs->sp;			\
-	pr_reg[16] = regs->ss & 0xffff;		\
+	pr_reg[16] = regs->ss;			\
 } while (0);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)	\
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)			\
 do {								\
+	unsigned long base;					\
 	unsigned v;						\
 	(pr_reg)[0] = (regs)->r15;				\
 	(pr_reg)[1] = (regs)->r14;				\
@@ -226,8 +227,8 @@ do {								\
 	(pr_reg)[18] = (regs)->flags;				\
 	(pr_reg)[19] = (regs)->sp;				\
 	(pr_reg)[20] = (regs)->ss;				\
-	(pr_reg)[21] = current->thread.fsbase;			\
-	(pr_reg)[22] = current->thread.gsbase;			\
+	rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base;		\
+	rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base;	\
 	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
 	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
 	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
@@ -247,11 +248,11 @@ extern int force_personality32;
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
 #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
-						  0x100000000UL)
+						  (TASK_SIZE / 3 * 2))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports.  This could be done in user space,
@@ -304,8 +305,8 @@ static inline int mmap_is_ia32(void)
 		test_thread_flag(TIF_ADDR32));
 }
 
-extern unsigned long tasksize_32bit(void);
-extern unsigned long tasksize_64bit(void);
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index df002992d8fd..aa15d1f7e530 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -13,18 +13,14 @@
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
-BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR,
-		 smp_irq_move_cleanup_interrupt)
-BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR)
 #endif
 
-BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
-
 #ifdef CONFIG_HAVE_KVM
-BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
-		 smp_kvm_posted_intr_ipi)
-BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
-		 smp_kvm_posted_intr_wakeup_ipi)
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)
 #endif
 
 /*
@@ -39,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
 BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
 #ifdef CONFIG_IRQ_WORK
 BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b65155cc3760..dcd9fb55e679 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx,
 }
 #endif
 
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+				      unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+					 unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+				      unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+					 unsigned long size);
+
 #include <asm-generic/fixmap.h>
 
 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..554cdb205d17 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
 	return 0;
 }
 
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
 {
 	if (use_xsave()) {
-		copy_kernel_to_xregs(&fpstate->xsave, -1);
+		copy_kernel_to_xregs(&fpstate->xsave, mask);
 	} else {
 		if (use_fxsr())
 			copy_kernel_to_fxregs(&fpstate->fxsave);
@@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
 			: : [addr] "m" (fpstate));
 	}
 
-	__copy_kernel_to_fpregs(fpstate);
+	__copy_kernel_to_fpregs(fpstate, -1);
 }
 
 extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f5453436..f4dc9b63bdda 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
 		       "+m" (*uaddr), "=&r" (tem)		\
 		     : "r" (oparg), "i" (-EFAULT), "1" (0))
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 9b76cd331990..ad1ed531febc 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,6 +15,7 @@ typedef struct {
 #ifdef CONFIG_HAVE_KVM
 	unsigned int kvm_posted_intr_ipis;
 	unsigned int kvm_posted_intr_wakeup_ipis;
+	unsigned int kvm_posted_intr_nested_ipis;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e1053049b..6dfe366a8804 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,7 @@ extern asmlinkage void apic_timer_interrupt(void);
 extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
 extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
+extern asmlinkage void kvm_posted_intr_nested_ipi(void);
 extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 
@@ -45,25 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void);
 extern asmlinkage void call_function_interrupt(void);
 extern asmlinkage void call_function_single_interrupt(void);
 
-#ifdef CONFIG_TRACING
-/* Interrupt handlers registered during init_IRQ */
-extern void trace_apic_timer_interrupt(void);
-extern void trace_x86_platform_ipi(void);
-extern void trace_error_interrupt(void);
-extern void trace_irq_work_interrupt(void);
-extern void trace_spurious_interrupt(void);
-extern void trace_thermal_interrupt(void);
-extern void trace_reschedule_interrupt(void);
-extern void trace_threshold_interrupt(void);
-extern void trace_deferred_error_interrupt(void);
-extern void trace_call_function_interrupt(void);
-extern void trace_call_function_single_interrupt(void);
-#define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
-#define trace_reboot_interrupt  reboot_interrupt
-#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
-#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
-#endif /* CONFIG_TRACING */
-
 #ifdef	CONFIG_X86_LOCAL_APIC
 struct irq_data;
 struct pci_dev;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 21126155a739..0ead9dbb9130 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
 
 	/* pin current vcpu to specified physical cpu (run rarely) */
 	void		(*pin_vcpu)(int);
+
+	/* called during init_mem_mapping() to setup early mappings. */
+	void		(*init_mem_mapping)(void);
 };
 
 extern const struct hypervisor_x86 *x86_hyper;
@@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
 extern void init_hypervisor_platform(void);
 extern bool hypervisor_x2apic_available(void);
 extern void hypervisor_pin_vcpu(int cpu);
+
+static inline void hypervisor_init_mem_mapping(void)
+{
+	if (x86_hyper && x86_hyper->init_mem_mapping)
+		x86_hyper->init_mem_mapping();
+}
 #else
 static inline void init_hypervisor_platform(void) { }
 static inline bool hypervisor_x2apic_available(void) { return false; }
+static inline void hypervisor_init_mem_mapping(void) { }
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 474eb8c66fee..05c4aa00cc86 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -7,6 +7,7 @@ struct x86_mapping_info {
 	unsigned long page_flag;	 /* page flag for PMD or PUD entry */
 	unsigned long offset;		 /* ident mapping offset */
 	bool direct_gbpages;		 /* PUD level 1GB page support */
+	unsigned long kernpg_flag;	 /* kernel pagetable flag override */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
deleted file mode 100644
index 597dc4995678..000000000000
--- a/arch/x86/include/asm/intel_rdt.h
+++ /dev/null
@@ -1,286 +0,0 @@
-#ifndef _ASM_X86_INTEL_RDT_H
-#define _ASM_X86_INTEL_RDT_H
-
-#ifdef CONFIG_INTEL_RDT_A
-
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/jump_label.h>
-
-#include <asm/intel_rdt_common.h>
-
-#define IA32_L3_QOS_CFG		0xc81
-#define IA32_L3_CBM_BASE	0xc90
-#define IA32_L2_CBM_BASE	0xd10
-#define IA32_MBA_THRTL_BASE	0xd50
-
-#define L3_QOS_CDP_ENABLE	0x01ULL
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn:				kernfs node
- * @rdtgroup_list:		linked list for all rdtgroups
- * @closid:			closid for this rdtgroup
- * @cpu_mask:			CPUs assigned to this rdtgroup
- * @flags:			status bits
- * @waitcount:			how many cpus expect to find this
- *				group when they acquire rdtgroup_mutex
- */
-struct rdtgroup {
-	struct kernfs_node	*kn;
-	struct list_head	rdtgroup_list;
-	int			closid;
-	struct cpumask		cpu_mask;
-	int			flags;
-	atomic_t		waitcount;
-};
-
-/* rdtgroup.flags */
-#define	RDT_DELETED		1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST	1
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name:	File name
- * @mode:	Access mode
- * @kf_ops:	File operations
- * @flags:	File specific RFTYPE_FLAGS_* flags
- * @seq_show:	Show content of the file
- * @write:	Write to the file
- */
-struct rftype {
-	char			*name;
-	umode_t			mode;
-	struct kernfs_ops	*kf_ops;
-	unsigned long		flags;
-
-	int (*seq_show)(struct kernfs_open_file *of,
-			struct seq_file *sf, void *v);
-	/*
-	 * write() is the generic write callback which maps directly to
-	 * kernfs write operation and overrides all other operations.
-	 * Maximum write size is determined by ->max_write_len.
-	 */
-	ssize_t (*write)(struct kernfs_open_file *of,
-			 char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct rdt_domain - group of cpus sharing an RDT resource
- * @list:	all instances of this resource
- * @id:		unique id for this instance
- * @cpu_mask:	which cpus share this resource
- * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
- * @new_ctrl:	new ctrl value to be loaded
- * @have_new_ctrl: did user provide new_ctrl for this domain
- */
-struct rdt_domain {
-	struct list_head	list;
-	int			id;
-	struct cpumask		cpu_mask;
-	u32			*ctrl_val;
-	u32			new_ctrl;
-	bool			have_new_ctrl;
-};
-
-/**
- * struct msr_param - set a range of MSRs from a domain
- * @res:       The resource to use
- * @low:       Beginning index from base MSR
- * @high:      End index
- */
-struct msr_param {
-	struct rdt_resource	*res;
-	int			low;
-	int			high;
-};
-
-/**
- * struct rdt_cache - Cache allocation related data
- * @cbm_len:		Length of the cache bit mask
- * @min_cbm_bits:	Minimum number of consecutive bits to be set
- * @cbm_idx_mult:	Multiplier of CBM index
- * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
- *			closid * cbm_idx_multi + cbm_idx_offset
- *			in a cache bit mask
- */
-struct rdt_cache {
-	unsigned int	cbm_len;
-	unsigned int	min_cbm_bits;
-	unsigned int	cbm_idx_mult;
-	unsigned int	cbm_idx_offset;
-};
-
-/**
- * struct rdt_membw - Memory bandwidth allocation related data
- * @max_delay:		Max throttle delay. Delay is the hardware
- *			representation for memory bandwidth.
- * @min_bw:		Minimum memory bandwidth percentage user can request
- * @bw_gran:		Granularity at which the memory bandwidth is allocated
- * @delay_linear:	True if memory B/W delay is in linear scale
- * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
- */
-struct rdt_membw {
-	u32		max_delay;
-	u32		min_bw;
-	u32		bw_gran;
-	u32		delay_linear;
-	u32		*mb_map;
-};
-
-/**
- * struct rdt_resource - attributes of an RDT resource
- * @enabled:		Is this feature enabled on this machine
- * @capable:		Is this feature available on this machine
- * @name:		Name to use in "schemata" file
- * @num_closid:		Number of CLOSIDs available
- * @cache_level:	Which cache level defines scope of this resource
- * @default_ctrl:	Specifies default cache cbm or memory B/W percent.
- * @msr_base:		Base MSR address for CBMs
- * @msr_update:		Function pointer to update QOS MSRs
- * @data_width:		Character width of data when displaying
- * @domains:		All domains for this resource
- * @cache:		Cache allocation related data
- * @info_files:		resctrl info files for the resource
- * @nr_info_files:	Number of info files
- * @format_str:		Per resource format string to show domain value
- * @parse_ctrlval:	Per resource function pointer to parse control values
- */
-struct rdt_resource {
-	bool			enabled;
-	bool			capable;
-	char			*name;
-	int			num_closid;
-	int			cache_level;
-	u32			default_ctrl;
-	unsigned int		msr_base;
-	void (*msr_update)	(struct rdt_domain *d, struct msr_param *m,
-				 struct rdt_resource *r);
-	int			data_width;
-	struct list_head	domains;
-	struct rdt_cache	cache;
-	struct rdt_membw	membw;
-	struct rftype		*info_files;
-	int			nr_info_files;
-	const char		*format_str;
-	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
-				 struct rdt_domain *d);
-};
-
-void rdt_get_cache_infofile(struct rdt_resource *r);
-void rdt_get_mba_infofile(struct rdt_resource *r);
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
-
-extern struct mutex rdtgroup_mutex;
-
-extern struct rdt_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-
-int __init rdtgroup_init(void);
-
-enum {
-	RDT_RESOURCE_L3,
-	RDT_RESOURCE_L3DATA,
-	RDT_RESOURCE_L3CODE,
-	RDT_RESOURCE_L2,
-	RDT_RESOURCE_MBA,
-
-	/* Must be the last */
-	RDT_NUM_RESOURCES,
-};
-
-#define for_each_capable_rdt_resource(r)				      \
-	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
-	     r++)							      \
-		if (r->capable)
-
-#define for_each_enabled_rdt_resource(r)				      \
-	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
-	     r++)							      \
-		if (r->enabled)
-
-/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
-union cpuid_0x10_1_eax {
-	struct {
-		unsigned int cbm_len:5;
-	} split;
-	unsigned int full;
-};
-
-/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
-union cpuid_0x10_3_eax {
-	struct {
-		unsigned int max_delay:12;
-	} split;
-	unsigned int full;
-};
-
-/* CPUID.(EAX=10H, ECX=ResID).EDX */
-union cpuid_0x10_x_edx {
-	struct {
-		unsigned int cos_max:16;
-	} split;
-	unsigned int full;
-};
-
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
-
-void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
-				char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
-			   struct seq_file *s, void *v);
-
-/*
- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
- *
- * Following considerations are made so that this has minimal impact
- * on scheduler hot path:
- * - This will stay as no-op unless we are running on an Intel SKU
- *   which supports resource control and we enable by mounting the
- *   resctrl file system.
- * - Caches the per cpu CLOSid values and does the MSR write only
- *   when a task with a different CLOSid is scheduled in.
- *
- * Must be called with preemption disabled.
- */
-static inline void intel_rdt_sched_in(void)
-{
-	if (static_branch_likely(&rdt_enable_key)) {
-		struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-		int closid;
-
-		/*
-		 * If this task has a closid assigned, use it.
-		 * Else use the closid assigned to this cpu.
-		 */
-		closid = current->closid;
-		if (closid == 0)
-			closid = this_cpu_read(cpu_closid);
-
-		if (closid != state->closid) {
-			state->closid = closid;
-			wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
-		}
-	}
-}
-
-#else
-
-static inline void intel_rdt_sched_in(void) {}
-
-#endif /* CONFIG_INTEL_RDT_A */
-#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
deleted file mode 100644
index b31081b89407..000000000000
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _ASM_X86_INTEL_RDT_COMMON_H
-#define _ASM_X86_INTEL_RDT_COMMON_H
-
-#define MSR_IA32_PQR_ASSOC	0x0c8f
-
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:		The cached Resource Monitoring ID
- * @closid:		The cached Class Of Service ID
- * @rmid_usecnt:	The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-	u32			rmid;
-	u32			closid;
-	int			rmid_usecnt;
-};
-
-DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
-
-#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h
new file mode 100644
index 000000000000..b4bbf8b21512
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -0,0 +1,92 @@
+#ifndef _ASM_X86_INTEL_RDT_SCHED_H
+#define _ASM_X86_INTEL_RDT_SCHED_H
+
+#ifdef CONFIG_INTEL_RDT
+
+#include <linux/sched.h>
+#include <linux/jump_label.h>
+
+#define IA32_PQR_ASSOC	0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @cur_rmid:		The cached Resource Monitoring ID
+ * @cur_closid:	The cached Class Of Service ID
+ * @default_rmid:	The user assigned Resource Monitoring ID
+ * @default_closid:	The user assigned cached Class Of Service ID
+ *
+ * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them. This also
+ * stores the user configured per cpu CLOSID and RMID.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+	u32			cur_rmid;
+	u32			cur_closid;
+	u32			default_rmid;
+	u32			default_closid;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+/*
+ * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control or monitoring and we enable by
+ *   mounting the resctrl file system.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ *   when a task with a different CLOSid/RMID is scheduled in.
+ * - We allocate RMIDs/CLOSids globally in order to keep this as
+ *   simple as possible.
+ * Must be called with preemption disabled.
+ */
+static void __intel_rdt_sched_in(void)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	u32 closid = state->default_closid;
+	u32 rmid = state->default_rmid;
+
+	/*
+	 * If this task has a closid/rmid assigned, use it.
+	 * Else use the closid/rmid assigned to this cpu.
+	 */
+	if (static_branch_likely(&rdt_alloc_enable_key)) {
+		if (current->closid)
+			closid = current->closid;
+	}
+
+	if (static_branch_likely(&rdt_mon_enable_key)) {
+		if (current->rmid)
+			rmid = current->rmid;
+	}
+
+	if (closid != state->cur_closid || rmid != state->cur_rmid) {
+		state->cur_closid = closid;
+		state->cur_rmid = rmid;
+		wrmsr(IA32_PQR_ASSOC, rmid, closid);
+	}
+}
+
+static inline void intel_rdt_sched_in(void)
+{
+	if (static_branch_likely(&rdt_enable_key))
+		__intel_rdt_sched_in();
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif /* CONFIG_INTEL_RDT */
+
+#endif /* _ASM_X86_INTEL_RDT_SCHED_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 7afb0e2f07f4..c40a95c33bb8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
 build_mmio_write(__writew, "w", unsigned short, "r", )
 build_mmio_write(__writel, "l", unsigned int, "r", )
 
+#define readb readb
+#define readw readw
+#define readl readl
 #define readb_relaxed(a) __readb(a)
 #define readw_relaxed(a) __readw(a)
 #define readl_relaxed(a) __readl(a)
@@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 #define __raw_readw __readw
 #define __raw_readl __readl
 
+#define writeb writeb
+#define writew writew
+#define writel writel
 #define writeb_relaxed(v, a) __writeb(v, a)
 #define writew_relaxed(v, a) __writew(v, a)
 #define writel_relaxed(v, a) __writel(v, a)
@@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 #ifdef CONFIG_X86_64
 
 build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
 build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
 
-#define readq_relaxed(a)	readq(a)
-#define writeq_relaxed(v, a)	writeq(v, a)
+#define readq_relaxed(a)	__readq(a)
+#define writeq_relaxed(v, a)	__writeq(v, a)
 
-#define __raw_readq(a)		readq(a)
-#define __raw_writeq(val, addr)	writeq(val, addr)
+#define __raw_readq		__readq
+#define __raw_writeq		__writeq
 
 /* Let people know that we have them */
 #define readq			readq
@@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
 {
 	return __pa(address);
 }
+#define virt_to_phys virt_to_phys
 
 /**
  *	phys_to_virt	-	map physical address to virtual
@@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
 {
 	return __va(address);
 }
+#define phys_to_virt phys_to_virt
 
 /*
  * Change "struct page" to physical address.
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  * else, you probably want one of the following.
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+#define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
 
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
+#define ioremap_prot ioremap_prot
 
 /**
  * ioremap     -   map bus memory into CPU space
@@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 {
 	return ioremap_nocache(offset, size);
 }
+#define ioremap ioremap
 
 extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
 
 extern void set_iounmap_nonlazy(void);
 
@@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void);
 #include <asm-generic/iomap.h>
 
 /*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)	p
-
-/**
- * memset_io	Set a range of I/O memory to a constant value
- * @addr:	The beginning of the I/O-memory range to set
- * @val:	The value to set the memory to
- * @count:	The number of bytes to set
- *
- * Set a range of I/O memory to a given value.
- */
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
-{
-	memset((void __force *)addr, val, count);
-}
-
-/**
- * memcpy_fromio	Copy a block of data from I/O memory
- * @dst:		The (RAM) destination for the copy
- * @src:		The (I/O memory) source for the data
- * @count:		The number of bytes to copy
- *
- * Copy a block of data from I/O memory.
- */
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
-{
-	memcpy(dst, (const void __force *)src, count);
-}
-
-/**
- * memcpy_toio		Copy a block of data into I/O memory
- * @dst:		The (I/O memory) destination for the copy
- * @src:		The (RAM) source for the data
- * @count:		The number of bytes to copy
- *
- * Copy a block of data to I/O memory.
- */
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
-{
-	memcpy((void __force *)dst, src, count);
-}
-
-/*
  * ISA space is 'always mapped' on a typical x86 system, no need to
  * explicitly ioremap() it. The fact that the ISA IO space is mapped
  * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
@@ -328,26 +296,51 @@ static inline unsigned type in##bwl##_p(int port)			\
 static inline void outs##bwl(int port, const void *addr, unsigned long count) \
 {									\
 	asm volatile("rep; outs" #bwl					\
-		     : "+S"(addr), "+c"(count) : "d"(port));		\
+		     : "+S"(addr), "+c"(count) : "d"(port) : "memory");	\
 }									\
 									\
 static inline void ins##bwl(int port, void *addr, unsigned long count)	\
 {									\
 	asm volatile("rep; ins" #bwl					\
-		     : "+D"(addr), "+c"(count) : "d"(port));		\
+		     : "+D"(addr), "+c"(count) : "d"(port) : "memory");	\
 }
 
 BUILDIO(b, b, char)
 BUILDIO(w, w, short)
 BUILDIO(l, , int)
 
+#define inb inb
+#define inw inw
+#define inl inl
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb outb
+#define outw outw
+#define outl outl
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
 extern void *xlate_dev_mem_ptr(phys_addr_t phys);
 extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+
 extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 				enum page_cache_mode pcm);
 extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
 extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
 
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
@@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 
 #define IO_SPACE_LIMIT 0xffff
 
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
 #ifdef CONFIG_MTRR
 extern int __must_check arch_phys_wc_index(int handle);
 #define arch_phys_wc_index arch_phys_wc_index
@@ -381,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif
 
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+					unsigned long size,
+					unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+				      unsigned long size);
+
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 668cca540025..9958ceea2fa3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
 
 extern __visible unsigned int do_IRQ(struct pt_regs *regs);
 
-/* Interrupt vector management */
-extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-extern int vector_used_by_percpu_irq(unsigned int vector);
-
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6ca9fd6234e1..aaf8d28b5d00 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -83,7 +83,6 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
-#define POSTED_INTR_WAKEUP_VECTOR	0xf1
 /*
  * IRQ work vector:
  */
@@ -98,6 +97,8 @@
 /* Vector for KVM to deliver posted interrupt IPI */
 #ifdef CONFIG_HAVE_KVM
 #define POSTED_INTR_VECTOR		0xf2
+#define POSTED_INTR_WAKEUP_VECTOR	0xf1
+#define POSTED_INTR_NESTED_VECTOR	0xf0
 #endif
 
 /*
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index f70604125286..ddbb8ea0f5a9 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -3,9 +3,17 @@
 
 #include <asm/cpufeature.h>
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static inline bool arch_irq_work_has_interrupt(void)
 {
 	return boot_cpu_has(X86_FEATURE_APIC);
 }
+extern void arch_irq_work_raise(void);
+#else
+static inline bool arch_irq_work_has_interrupt(void)
+{
+	return false;
+}
+#endif
 
 #endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 70ef205489f0..942c1f444da8 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -147,7 +147,8 @@ unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
 		unsigned long start_address,
-		unsigned int preserve_context);
+		unsigned int preserve_context,
+		unsigned int sme_active);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
@@ -207,6 +208,14 @@ struct kexec_entry64_regs {
 	uint64_t r15;
 	uint64_t rip;
 };
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+				       gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
 #endif
 
 typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 34b984c60790..6cf65437b5e5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -52,10 +52,10 @@ typedef u8 kprobe_opcode_t;
 #define flush_insn_slot(p)	do { } while (0)
 
 /* optinsn template addresses */
-extern __visible kprobe_opcode_t optprobe_template_entry;
-extern __visible kprobe_opcode_t optprobe_template_val;
-extern __visible kprobe_opcode_t optprobe_template_call;
-extern __visible kprobe_opcode_t optprobe_template_end;
+extern __visible kprobe_opcode_t optprobe_template_entry[];
+extern __visible kprobe_opcode_t optprobe_template_val[];
+extern __visible kprobe_opcode_t optprobe_template_call[];
+extern __visible kprobe_opcode_t optprobe_template_end[];
 #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
 #define MAX_OPTINSN_SIZE 				\
 	(((unsigned long)&optprobe_template_end -	\
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fba6d8e..369e41c23f07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -492,6 +492,7 @@ struct kvm_vcpu_arch {
 	unsigned long cr4;
 	unsigned long cr4_guest_owned_bits;
 	unsigned long cr8;
+	u32 pkru;
 	u32 hflags;
 	u64 efer;
 	u64 apic_base;
@@ -1078,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-		u64 acc_track_mask);
+		u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-					   unsigned long address);
 
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS	10
-#define GDT_ENTRY_LGUEST_DS	11
-#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
-	/* Manually saved part. */
-	unsigned long eax, ebx, ecx, edx;
-	unsigned long esi, edi, ebp;
-	unsigned long gs;
-	unsigned long fs, ds, es;
-	unsigned long trapnum, errcode;
-	/* Trap pushed part */
-	unsigned long eip;
-	unsigned long cs;
-	unsigned long eflags;
-	unsigned long esp;
-	unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
-	/* Host information we need to restore when we switch back. */
-	u32 host_cr3;
-	struct desc_ptr host_idt_desc;
-	struct desc_ptr host_gdt_desc;
-	u32 host_sp;
-
-	/* Fields which are used when guest is running. */
-	struct desc_ptr guest_idt_desc;
-	struct desc_ptr guest_gdt_desc;
-	struct x86_hw_tss guest_tss;
-	struct desc_struct guest_idt[IDT_ENTRIES];
-	struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
-	/* The GDT entries copied into lguest_ro_state when running. */
-	struct desc_struct gdt[GDT_ENTRIES];
-
-	/* The IDT entries: some copied into lguest_ro_state when running. */
-	struct desc_struct idt[IDT_ENTRIES];
-
-	/* The address of the last guest-visible pagefault (ie. cr2). */
-	unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
-	u32 cr0;
-
-	cr0 = read_cr0();
-	if (!(cr0 & 8))
-		write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
-	((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC	0
-#define LHCALL_LGUEST_INIT	1
-#define LHCALL_SHUTDOWN		2
-#define LHCALL_NEW_PGTABLE	4
-#define LHCALL_FLUSH_TLB	5
-#define LHCALL_LOAD_IDT_ENTRY	6
-#define LHCALL_SET_STACK	7
-#define LHCALL_SET_CLOCKEVENT	9
-#define LHCALL_HALT		10
-#define LHCALL_SET_PMD		13
-#define LHCALL_SET_PTE		14
-#define LHCALL_SET_PGD		15
-#define LHCALL_LOAD_TLS		16
-#define LHCALL_LOAD_GDT_ENTRY	18
-#define LHCALL_SEND_INTERRUPTS	19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF	1
-#define LGUEST_SHUTDOWN_RESTART		2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations?  There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3,
-      unsigned long arg4)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
-		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
-	/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..8e618fcf1f7c
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,80 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern unsigned long sme_me_mask;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+			 unsigned long decrypted_kernel_vaddr,
+			 unsigned long kernel_len,
+			 unsigned long encryption_wa,
+			 unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+			      unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+			      unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void __init sme_encrypt_kernel(void);
+void __init sme_enable(struct boot_params *bp);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+
+#else	/* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask	0UL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+					    unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+					    unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void __init sme_encrypt_kernel(void) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
+
+#endif	/* CONFIG_AMD_MEM_ENCRYPT */
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register.  Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoid special case handling of PGD allocations.
+ */
+#define __sme_pa(x)		(__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
+
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 79b647a7ebd0..bb8c597c2248 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,12 +3,28 @@
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/atomic.h>
 
 /*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
  */
 typedef struct {
+	/*
+	 * ctx_id uniquely identifies this mm_struct.  A ctx_id will never
+	 * be reused, and zero is not a valid ctx_id.
+	 */
+	u64 ctx_id;
+
+	/*
+	 * Any code that needs to do any sort of TLB flushing for this
+	 * mm will first make its changes to the page tables, then
+	 * increment tlb_gen, then flush.  This lets the low-level
+	 * flushing code keep track of what needs flushing.
+	 *
+	 * This is not used on Xen PV.
+	 */
+	atomic64_t tlb_gen;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 #endif
@@ -37,6 +53,11 @@ typedef struct {
 #endif
 } mm_context_t;
 
+#define INIT_MM_CONTEXT(mm)						\
+	.context = {							\
+		.ctx_id = 1,						\
+	}
+
 void leave_mm(int cpu);
 
 #endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index ecfcb6643c9b..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
 #ifndef CONFIG_PARAVIRT
 static inline void paravirt_activate_mm(struct mm_struct *prev,
 					struct mm_struct *next)
@@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+	int cpu = smp_processor_id();
+
+	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+		cpumask_clear_cpu(cpu, mm_cpumask(mm));
 }
 
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+	atomic64_set(&mm->context.tlb_gen, 0);
+
 	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
 		/* pkey 0 is the default and always allocated */
@@ -140,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk,
 		mm->context.execute_only_pkey = -1;
 	}
 	#endif
-	init_new_context_ldt(tsk, mm);
-
-	return 0;
+	return init_new_context_ldt(tsk, mm);
 }
 static inline void destroy_context(struct mm_struct *mm)
 {
@@ -292,8 +298,11 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
 	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	/* For now, be very restrictive about when this can be called. */
-	VM_WARN_ON(in_nmi() || !in_atomic());
+	VM_WARN_ON(in_nmi() || preemptible());
 
 	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819caeef..9eb7c718aaf8 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -2,6 +2,15 @@
 #define _ASM_X86_MODULE_H
 
 #include <asm-generic/module.h>
+#include <asm/orc_types.h>
+
+struct mod_arch_specific {
+#ifdef CONFIG_ORC_UNWINDER
+	unsigned int num_orcs;
+	int *orc_unwind_ip;
+	struct orc_entry *orc_unwind;
+#endif
+};
 
 #ifdef CONFIG_X86_64
 /* X86_64 does not define MODULE_PROC_FAMILY */
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a0d662be4c5b..7d7404756bb4 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 }
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+		unsigned long flags);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
 				    unsigned long start, unsigned long end)
 {
 }
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+		unsigned long len, unsigned long flags)
+{
+	return addr;
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5573c75f8e4c..17f5c12e1afd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -356,6 +356,8 @@
 #define MSR_K8_TOP_MEM1			0xc001001a
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_K8_SYSCFG			0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT	23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT	BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
 #define MSR_K8_INT_PENDING_MSG		0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK		0x18000000
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000000..91c8d868424d
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range.  An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER	8
+#define LOOKUP_BLOCK_SIZE	(1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP		(unsigned long)_stext
+#define LOOKUP_STOP_IP		(unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..9c9dc579bd7d
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED		0
+#define ORC_REG_PREV_SP			1
+#define ORC_REG_DX			2
+#define ORC_REG_DI			3
+#define ORC_REG_BP			4
+#define ORC_REG_SP			5
+#define ORC_REG_R10			6
+#define ORC_REG_R13			7
+#define ORC_REG_BP_INDIRECT		8
+#define ORC_REG_SP_INDIRECT		9
+#define ORC_REG_MAX			15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call).  Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct.  They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL			0
+#define ORC_TYPE_REGS			1
+#define ORC_TYPE_REGS_IRET		2
+#define UNWIND_HINT_TYPE_SAVE		3
+#define UNWIND_HINT_TYPE_RESTORE	4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard.  It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder.  It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address.  Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+	s16		sp_offset;
+	s16		bp_offset;
+	unsigned	sp_reg:4;
+	unsigned	bp_reg:4;
+	unsigned	type:2;
+} __packed;
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+	u32		ip;
+	s16		sp_offset;
+	u8		sp_reg;
+	u8		type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..b50df06ad251 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+#ifdef CONFIG_X86_MCE
+#define arch_unmap_kpfn arch_unmap_kpfn
+#endif
+
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 7bd0099384ca..b98ed9d14630 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
+#include <linux/mem_encrypt.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT		12
@@ -15,7 +16,7 @@
 #define PUD_PAGE_SIZE		(_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK		(~(PUD_PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK		((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#define __PHYSICAL_MASK		((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
 #define __VIRTUAL_MASK		((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..c25dd22f7c70 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -960,11 +960,6 @@ extern void default_banner(void);
 #define GET_CR2_INTO_RAX				\
 	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
 
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME					\
-	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
-		  CLBR_NONE,						\
-		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
-
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index cb976bab6299..6b64fc6367f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -84,7 +84,7 @@ struct pv_init_ops {
 	 */
 	unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
 			  unsigned long addr, unsigned len);
-};
+} __no_randomize_layout;
 
 
 struct pv_lazy_ops {
@@ -92,12 +92,12 @@ struct pv_lazy_ops {
 	void (*enter)(void);
 	void (*leave)(void);
 	void (*flush)(void);
-};
+} __no_randomize_layout;
 
 struct pv_time_ops {
 	unsigned long long (*sched_clock)(void);
 	unsigned long long (*steal_clock)(int cpu);
-};
+} __no_randomize_layout;
 
 struct pv_cpu_ops {
 	/* hooks for various privileged instructions */
@@ -176,7 +176,7 @@ struct pv_cpu_ops {
 
 	void (*start_context_switch)(struct task_struct *prev);
 	void (*end_context_switch)(struct task_struct *next);
-};
+} __no_randomize_layout;
 
 struct pv_irq_ops {
 	/*
@@ -196,10 +196,7 @@ struct pv_irq_ops {
 	void (*safe_halt)(void);
 	void (*halt)(void);
 
-#ifdef CONFIG_X86_64
-	void (*adjust_exception_frame)(void);
-#endif
-};
+} __no_randomize_layout;
 
 struct pv_mmu_ops {
 	unsigned long (*read_cr2)(void);
@@ -305,7 +302,7 @@ struct pv_mmu_ops {
 	   an mfn.  We can tell which is which from the index. */
 	void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
 			   phys_addr_t phys, pgprot_t flags);
-};
+} __no_randomize_layout;
 
 struct arch_spinlock;
 #ifdef CONFIG_SMP
@@ -322,7 +319,7 @@ struct pv_lock_ops {
 	void (*kick)(int cpu);
 
 	struct paravirt_callee_save vcpu_is_preempted;
-};
+} __no_randomize_layout;
 
 /* This contains all the paravirt structures: we get a convenient
  * number for each function using the offset which we use to indicate
@@ -334,7 +331,7 @@ struct paravirt_patch_template {
 	struct pv_irq_ops pv_irq_ops;
 	struct pv_mmu_ops pv_mmu_ops;
 	struct pv_lock_ops pv_lock_ops;
-};
+} __no_randomize_layout;
 
 extern struct pv_info pv_info;
 extern struct pv_init_ops pv_init_ops;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 77037b6f1caa..bbeae4a2bd01 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_PGTABLE_H
 #define _ASM_X86_PGTABLE_H
 
+#include <linux/mem_encrypt.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 
@@ -13,9 +14,18 @@
 		     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))	\
 	 : (prot))
 
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot)	__pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot)	__pgprot(__sme_clr(pgprot_val(prot)))
+
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -38,6 +48,8 @@ extern struct list_head pgd_list;
 
 extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
+extern pmdval_t early_pmd_flags;
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
 	return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+	return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
 static inline int p4d_large(p4d_t p4d)
 {
 	/* No 512 GiB pages yet */
@@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pmd_page(pmd)		\
-	pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
+#define pmd_page(pmd)	pfn_to_page(pmd_pfn(pmd))
 
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pud_page(pud)		\
-	pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
+#define pud_page(pud)	pfn_to_page(pud_pfn(pud))
 
 /* Find an entry in the second-level page table.. */
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define p4d_page(p4d)		\
-	pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+#define p4d_page(p4d)	pfn_to_page(p4d_pfn(p4d))
 
 /* Find an entry in the third-level page table.. */
 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pgd_page(pgd)		pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd)	pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..399261ce904c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -2,6 +2,8 @@
 #define _ASM_X86_PGTABLE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
 #include <asm/page_types.h>
 
 #define FIRST_USER_ADDRESS	0UL
@@ -121,10 +123,10 @@
 
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
-			 _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
-			 _PAGE_DIRTY)
+#define _PAGE_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+				 _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW |		\
+				 _PAGE_ACCESSED | _PAGE_DIRTY)
 
 /*
  * Set of bits not changed in pte_modify.  The pte's
@@ -159,6 +161,7 @@ enum page_cache_mode {
 
 #define _PAGE_CACHE_MASK	(_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
 #define _PAGE_NOCACHE		(cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP		(cachemode2protval(_PAGE_CACHE_MODE_WP))
 
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
 #define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -187,22 +190,42 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_VVAR		(__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP		(__PAGE_KERNEL | _PAGE_CACHE_WP)
 
 #define __PAGE_KERNEL_IO		(__PAGE_KERNEL)
 #define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE)
 
-#define PAGE_KERNEL			__pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO			__pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC		__pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX			__pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE		__pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE		__pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC		__pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL		__pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VVAR		__pgprot(__PAGE_KERNEL_VVAR)
+#ifndef __ASSEMBLY__
+
+#define _PAGE_ENC	(_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
+			 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
+			 _PAGE_DIRTY | _PAGE_ENC)
+
+#define __PAGE_KERNEL_ENC	(__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP	(__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC	(__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP	(__PAGE_KERNEL_WP)
+
+#define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC	__pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VSYSCALL	__pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR	__pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO		__pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE	__pgprot(__PAGE_KERNEL_IO_NOCACHE)
 
-#define PAGE_KERNEL_IO			__pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE		__pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#endif	/* __ASSEMBLY__ */
 
 /*         xwr */
 #define __P000	PAGE_NONE
@@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
 #else
 #include <asm-generic/pgtable-nop4d.h>
 
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+	return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
 static inline p4dval_t native_p4d_val(p4d_t p4d)
 {
 	return native_pgd_val(p4d.pgd);
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..dc723b64acf0 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PROCESSOR_FLAGS_H
 
 #include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_VM86
 #define X86_VM_MASK	X86_EFLAGS_VM
@@ -32,16 +33,18 @@
  * CR3_ADDR_MASK is the mask used by read_cr3_pa().
  */
 #ifdef CONFIG_X86_64
-/* Mask off the address space ID bits. */
-#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
-#define CR3_PCID_MASK 0xFFFull
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK	0xFFFull
+#define CR3_NOFLUSH	BIT_ULL(63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
  * a tiny bit of code size by setting all the bits.
  */
-#define CR3_ADDR_MASK 0xFFFFFFFFull
-#define CR3_PCID_MASK 0ull
+#define CR3_ADDR_MASK	0xFFFFFFFFull
+#define CR3_PCID_MASK	0ull
+#define CR3_NOFLUSH	0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6a79547e8ee0..3fa26a61eabc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -29,6 +30,7 @@ struct vm86;
 #include <linux/math64.h>
 #include <linux/err.h>
 #include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
 
 /*
  * We handle most unaligned accesses in hardware.  On the other hand
@@ -129,7 +131,7 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 	u32			microcode;
-};
+} __randomize_layout;
 
 struct cpuid_regs {
 	u32 eax, ebx, ecx, edx;
@@ -239,9 +241,14 @@ static inline unsigned long read_cr3_pa(void)
 	return __read_cr3() & CR3_ADDR_MASK;
 }
 
+static inline unsigned long native_read_cr3_pa(void)
+{
+	return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
-	write_cr3(__pa(pgdir));
+	write_cr3(__sme_pa(pgdir));
 }
 
 #ifdef CONFIG_X86_32
@@ -661,7 +668,7 @@ static inline void sync_core(void)
 	 * In case NMI unmasking or performance ever becomes a problem,
 	 * the next best option appears to be MOV-to-CR2 and an
 	 * unconditional jump.  That sequence also works on all CPUs,
-	 * but it will fault at CPL3 (i.e. Xen PV and lguest).
+	 * but it will fault at CPL3 (i.e. Xen PV).
 	 *
 	 * CPUID is the conventional way, but it's nasty: it doesn't
 	 * exist on some 486-like CPUs, and it usually exits to a
@@ -684,6 +691,7 @@ static inline void sync_core(void)
 	unsigned int tmp;
 
 	asm volatile (
+		UNWIND_HINT_SAVE
 		"mov %%ss, %0\n\t"
 		"pushq %q0\n\t"
 		"pushq %%rsp\n\t"
@@ -693,6 +701,7 @@ static inline void sync_core(void)
 		"pushq %q0\n\t"
 		"pushq $1f\n\t"
 		"iretq\n\t"
+		UNWIND_HINT_RESTORE
 		"1:"
 		: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
 #endif
@@ -802,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define IA32_PAGE_OFFSET	PAGE_OFFSET
 #define TASK_SIZE		PAGE_OFFSET
+#define TASK_SIZE_LOW		TASK_SIZE
 #define TASK_SIZE_MAX		TASK_SIZE
+#define DEFAULT_MAP_WINDOW	TASK_SIZE
 #define STACK_TOP		TASK_SIZE
 #define STACK_TOP_MAX		STACK_TOP
 
@@ -842,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x)
  * particular problem by preventing anything from being mapped
  * at the maximum canonical address.
  */
-#define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
@@ -850,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x)
 #define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
 					0xc0000000 : 0xFFFFe000)
 
+#define TASK_SIZE_LOW		(test_thread_flag(TIF_ADDR32) ? \
+					IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
 #define TASK_SIZE		(test_thread_flag(TIF_ADDR32) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 #define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
-#define STACK_TOP		TASK_SIZE
+#define STACK_TOP		TASK_SIZE_LOW
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  {						\
@@ -876,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
  * space during mmap's.
  */
 #define __TASK_UNMAPPED_BASE(task_size)	(PAGE_ALIGN(task_size / 3))
-#define TASK_UNMAPPED_BASE		__TASK_UNMAPPED_BASE(TASK_SIZE)
+#define TASK_UNMAPPED_BASE		__TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
 
 #define KSTK_EIP(task)		(task_pt_regs(task)->ip)
 
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 8d3964fc5f91..b408b1886195 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void);
 void __end_entry_SYSENTER_compat(void);
 void entry_SYSCALL_compat(void);
 void entry_INT80_compat(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+void xen_entry_INT80_compat(void);
+#endif
 #endif
 
 void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..91c04c8e67fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -9,6 +9,20 @@
 #ifdef __i386__
 
 struct pt_regs {
+	/*
+	 * NB: 32-bit x86 CPUs are inconsistent as what happens in the
+	 * following cases (where %seg represents a segment register):
+	 *
+	 * - pushl %seg: some do a 16-bit write and leave the high
+	 *   bits alone
+	 * - movl %seg, [mem]: some do a 16-bit write despite the movl
+	 * - IDT entry: some (e.g. 486) will leave the high bits of CS
+	 *   and (if applicable) SS undefined.
+	 *
+	 * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+	 * so we can just treat all of the segment registers as 16-bit
+	 * values.
+	 */
 	unsigned long bx;
 	unsigned long cx;
 	unsigned long dx;
@@ -16,16 +30,22 @@ struct pt_regs {
 	unsigned long di;
 	unsigned long bp;
 	unsigned long ax;
-	unsigned long ds;
-	unsigned long es;
-	unsigned long fs;
-	unsigned long gs;
+	unsigned short ds;
+	unsigned short __dsh;
+	unsigned short es;
+	unsigned short __esh;
+	unsigned short fs;
+	unsigned short __fsh;
+	unsigned short gs;
+	unsigned short __gsh;
 	unsigned long orig_ax;
 	unsigned long ip;
-	unsigned long cs;
+	unsigned short cs;
+	unsigned short __csh;
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
+	unsigned short ss;
+	unsigned short __ssh;
 };
 
 #else /* __i386__ */
@@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
 	if (offset == offsetof(struct pt_regs, sp) &&
 	    regs->cs == __KERNEL_CS)
 		return kernel_stack_pointer(regs);
+
+	/* The selector fields are 16-bit. */
+	if (offset == offsetof(struct pt_regs, cs) ||
+	    offset == offsetof(struct pt_regs, ss) ||
+	    offset == offsetof(struct pt_regs, ds) ||
+	    offset == offsetof(struct pt_regs, es) ||
+	    offset == offsetof(struct pt_regs, fs) ||
+	    offset == offsetof(struct pt_regs, gs)) {
+		return *(u16 *)((unsigned long)regs + offset);
+
+	}
 #endif
 	return *(unsigned long *)((unsigned long)regs + offset);
 }
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 230e1903acf0..90d91520c13a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -1,6 +1,15 @@
 #ifndef _ARCH_X86_REALMODE_H
 #define _ARCH_X86_REALMODE_H
 
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT		0
+#define TH_FLAGS_SME_ACTIVE		BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/io.h>
 
@@ -38,6 +47,7 @@ struct trampoline_header {
 	u64 start;
 	u64 efer;
 	u32 cr4;
+	u32 flags;
 #endif
 };
 
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
 void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+
+/*
+ * This is the first portion of the refcount error handling, which lives in
+ * .text.unlikely, and is jumped to from the CPU flag check (in the
+ * following macros). This saves the refcount value location into CX for
+ * the exception handler to use (in mm/extable.c), and then triggers the
+ * central refcount exception. The fixup address for the exception points
+ * back to the regular execution flow in .text.
+ */
+#define _REFCOUNT_EXCEPTION				\
+	".pushsection .text.unlikely\n"			\
+	"111:\tlea %[counter], %%" _ASM_CX "\n"		\
+	"112:\t" ASM_UD0 "\n"				\
+	ASM_UNREACHABLE					\
+	".popsection\n"					\
+	"113:\n"					\
+	_ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO				\
+	"js 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO				\
+	"jz 111f\n\t"					\
+	REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR					\
+	"jmp 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+		REFCOUNT_CHECK_LT_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: "ir" (i)
+		: "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "incl %0\n\t"
+		REFCOUNT_CHECK_LT_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "decl %0\n\t"
+		REFCOUNT_CHECK_LE_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+	GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
+				  r->refs.counter, "er", i, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+	GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
+				 r->refs.counter, "%0", e);
+}
+
+static __always_inline __must_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+	int c, result;
+
+	c = atomic_read(&(r->refs));
+	do {
+		if (unlikely(c == 0))
+			return false;
+
+		result = c + i;
+
+		/* Did we try to increment from/to an undesirable state? */
+		if (unlikely(c < 0 || c == INT_MAX || result < c)) {
+			asm volatile(REFCOUNT_ERROR
+				     : : [counter] "m" (r->refs.counter)
+				     : "cc", "cx");
+			break;
+		}
+
+	} while (!atomic_try_cmpxchg(&(r->refs), &c, result));
+
+	return c != 0;
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+	return refcount_add_not_zero(1, r);
+}
+
+#endif
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..045f99211a99 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,45 +1,56 @@
 #ifndef _ASM_X86_RMWcc
 #define _ASM_X86_RMWcc
 
+#define __CLOBBERS_MEM		"memory"
+#define __CLOBBERS_MEM_CC_CX	"memory", "cc", "cx"
+
 #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
 
 /* Use asm goto */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	asm_volatile_goto (fullop "; j" #cc " %l[cc_label]"		\
-			: : "m" (var), ## __VA_ARGS__ 			\
-			: "memory" : cc_label);				\
+			: : [counter] "m" (var), ## __VA_ARGS__		\
+			: clobbers : cc_label);				\
 	return 0;							\
 cc_label:								\
 	return 1;							\
 } while (0)
 
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) 				\
-	__GEN_RMWcc(op " " arg0, var, cc)
+#define __BINARY_RMWcc_ARG	" %1, "
 
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
 
 #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 /* Use flags output or a set instruction */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	bool c;								\
 	asm volatile (fullop ";" CC_SET(cc)				\
-			: "+m" (var), CC_OUT(cc) (c)			\
-			: __VA_ARGS__ : "memory");			\
+			: [counter] "+m" (var), CC_OUT(cc) (c)		\
+			: __VA_ARGS__ : clobbers);			\
 	return c;							\
 } while (0)
 
+#define __BINARY_RMWcc_ARG	" %2, "
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
 #define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
-	__GEN_RMWcc(op " " arg0, var, cc)
+	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc)		\
+	__GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc,			\
+		    __CLOBBERS_MEM_CC_CX)
 
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc,		\
+		    __CLOBBERS_MEM, vcon (val))
 
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc)	\
+	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc,	\
+		    __CLOBBERS_MEM_CC_CX, vcon (val))
 
 #endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 1549caa098f0..066aaf813141 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -238,9 +238,7 @@
 #ifndef __ASSEMBLY__
 
 extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
-#ifdef CONFIG_TRACING
-# define trace_early_idt_handler_array early_idt_handler_array
-#endif
+extern void early_ignore_irq(void);
 
 /*
  * Load a segment. Fall back on loading the zero segment if something goes
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index eaec6c364e42..cd71273ec49d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -11,6 +11,7 @@
  * Executability : eXeutable, NoteXecutable
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
+ * Encryption    : Encrypted, Decrypted
  *
  * Within a category, the attributes are mutually exclusive.
  *
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
 int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_FSCHECK		31	/* Check FS is USER_DS on return */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32		(1 << TIF_ADDR32)
 #define _TIF_X32		(1 << TIF_X32)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 
 /*
  * work to do in syscall_trace_enter().  Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
 	(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING |	\
 	 _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU |	\
 	 _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE |	\
-	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_FSCHECK)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index c7797307fc2b..79a4ca6a9606 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -15,4 +15,18 @@
 
 #include <asm-generic/tlb.h>
 
+/*
+ * While x86 architecture in general requires an IPI to perform TLB
+ * shootdown, enablement code for several hypervisors overrides
+ * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
+ * a hypercall. To keep software pagetable walkers safe in this case we
+ * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
+ * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * for more details.
+ */
+static inline void __tlb_remove_table(void *table)
+{
+	free_page_and_swap_cache(table);
+}
+
 #endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 50ea3482e1d1..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
 	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
 }
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	u64 new_tlb_gen;
+
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	smp_mb__before_atomic();
+	new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+	smp_mb__after_atomic();
+
+	return new_tlb_gen;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+struct tlb_context {
+	u64 ctx_id;
+	u64 tlb_gen;
+};
+
 struct tlb_state {
 	/*
 	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -73,13 +101,35 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
-	int state;
+	u16 loaded_mm_asid;
+	u16 next_asid;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
 	 */
 	unsigned long cr4;
+
+	/*
+	 * This is a list of all contexts that might exist in the TLB.
+	 * There is one per ASID that we use, and the ASID (what the
+	 * CPU calls PCID) is the index into ctxts.
+	 *
+	 * For each context, ctx_id indicates which mm the TLB's user
+	 * entries came from.  As an invariant, the TLB will never
+	 * contain entries that are out-of-date as when that mm reached
+	 * the tlb_gen in the list.
+	 *
+	 * To be clear, this means that it's legal for the TLB code to
+	 * flush the TLB without updating tlb_gen.  This can happen
+	 * (for now, at least) due to paravirt remote flushes.
+	 *
+	 * NB: context 0 is a bit special, since it's also used by
+	 * various bits of init code.  This is fine -- code that
+	 * isn't aware of PCID will end up harmlessly flushing
+	 * context 0.
+	 */
+	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void)
 		__flush_tlb_global();
 	else
 		__flush_tlb();
+
+	/*
+	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+	 * we'd end up flushing kernel translations for the current ASID but
+	 * we might fail to flush kernel translations for other cached ASIDs.
+	 *
+	 * To avoid this issue, we force PCID off if PGE is off.
+	 */
 }
 
 static inline void __flush_tlb_one(unsigned long addr)
@@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-	struct mm_struct *mm;
-	unsigned long start;
-	unsigned long end;
+	/*
+	 * We support several kinds of flushes.
+	 *
+	 * - Fully flush a single mm.  .mm will be set, .end will be
+	 *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+	 *   which the IPI sender is trying to catch us up.
+	 *
+	 * - Partially flush a single mm.  .mm will be set, .start and
+	 *   .end will indicate the range, and .new_tlb_gen will be set
+	 *   such that the changes between generation .new_tlb_gen-1 and
+	 *   .new_tlb_gen are entirely contained in the indicated range.
+	 *
+	 * - Fully flush all mms whose tlb_gens have been updated.  .mm
+	 *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+	 *   will be zero.
+	 */
+	struct mm_struct	*mm;
+	unsigned long		start;
+	unsigned long		end;
+	u64			new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
@@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);
 
-#define TLBSTATE_OK	1
-#define TLBSTATE_LAZY	2
-
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {
+	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 6358a85e2270..c1d2a9892352 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 
 extern void setup_node_to_cpumask_map(void);
 
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
-
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
 extern int __node_distance(int, int);
diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
new file mode 100644
index 000000000000..57c8da027d99
--- /dev/null
+++ b/arch/x86/include/asm/trace/common.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_TRACE_COMMON_H
+#define _ASM_TRACE_COMMON_H
+
+#ifdef CONFIG_TRACING
+DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
+#define trace_pagefault_enabled()			\
+	static_branch_unlikely(&trace_pagefault_key)
+DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+#define trace_resched_ipi_enabled()			\
+	static_branch_unlikely(&trace_resched_ipi_key)
+#else
+static inline bool trace_pagefault_enabled(void) { return false; }
+static inline bool trace_resched_ipi_enabled(void) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2422b14c50a7..5665bf205b8d 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -5,9 +5,10 @@
 #define _TRACE_PAGE_FAULT_H
 
 #include <linux/tracepoint.h>
+#include <asm/trace/common.h>
 
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+extern int trace_pagefault_reg(void);
+extern void trace_pagefault_unreg(void);
 
 DECLARE_EVENT_CLASS(x86_exceptions,
 
@@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name,				\
 	TP_PROTO(unsigned long address,	struct pt_regs *regs,	\
 		 unsigned long error_code),			\
 	TP_ARGS(address, regs, error_code),			\
-	trace_irq_vector_regfunc,				\
-	trace_irq_vector_unregfunc);
+	trace_pagefault_reg, trace_pagefault_unreg);
 
 DEFINE_PAGE_FAULT_EVENT(page_fault_user);
 DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 32dd6a9e343c..1599d394c8c1 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -5,9 +5,12 @@
 #define _TRACE_IRQ_VECTORS_H
 
 #include <linux/tracepoint.h>
+#include <asm/trace/common.h>
 
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int trace_resched_ipi_reg(void);
+extern void trace_resched_ipi_unreg(void);
 
 DECLARE_EVENT_CLASS(x86_irq_vector,
 
@@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector,
 #define DEFINE_IRQ_VECTOR_EVENT(name)		\
 DEFINE_EVENT_FN(x86_irq_vector, name##_entry,	\
 	TP_PROTO(int vector),			\
+	TP_ARGS(vector), NULL, NULL);		\
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit,	\
+	TP_PROTO(int vector),			\
+	TP_ARGS(vector), NULL, NULL);
+
+#define DEFINE_RESCHED_IPI_EVENT(name)		\
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry,	\
+	TP_PROTO(int vector),			\
 	TP_ARGS(vector),			\
-	trace_irq_vector_regfunc,		\
-	trace_irq_vector_unregfunc);		\
+	trace_resched_ipi_reg,			\
+	trace_resched_ipi_unreg);		\
 DEFINE_EVENT_FN(x86_irq_vector, name##_exit,	\
 	TP_PROTO(int vector),			\
 	TP_ARGS(vector),			\
-	trace_irq_vector_regfunc,		\
-	trace_irq_vector_unregfunc);
-
+	trace_resched_ipi_reg,			\
+	trace_resched_ipi_unreg);
 
 /*
  * local_timer - called when entering/exiting a local timer interrupt
@@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit,	\
 DEFINE_IRQ_VECTOR_EVENT(local_timer);
 
 /*
- * reschedule - called when entering/exiting a reschedule vector handler
- */
-DEFINE_IRQ_VECTOR_EVENT(reschedule);
-
-/*
  * spurious_apic - called when entering/exiting a spurious apic vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(spurious_apic);
@@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic);
  */
 DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);
 
+#ifdef CONFIG_IRQ_WORK
 /*
  * irq_work - called when entering/exiting a irq work interrupt
  * vector handler
@@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work);
  *  4) goto 1
  */
 TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
+#endif
+
+/*
+ * The ifdef is required because that tracepoint macro hell emits tracepoint
+ * code in files which include this header even if the tracepoint is not
+ * enabled. Brilliant stuff that.
+ */
+#ifdef CONFIG_SMP
+/*
+ * reschedule - called when entering/exiting a reschedule vector handler
+ */
+DEFINE_RESCHED_IPI_EVENT(reschedule);
 
 /*
  * call_function - called when entering/exiting a call function interrupt
@@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function);
  * single interrupt vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(call_function_single);
+#endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
 /*
  * threshold_apic - called when entering/exiting a threshold apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
+#endif
 
+#ifdef CONFIG_X86_MCE_AMD
 /*
  * deferred_error_apic - called when entering/exiting a deferred apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+#endif
 
+#ifdef CONFIG_X86_THERMAL_VECTOR
 /*
  * thermal_apic - called when entering/exiting a thermal apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
+#endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
 
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH .
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7f48cd..5545f6459bf5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,9 +13,6 @@ asmlinkage void divide_error(void);
 asmlinkage void debug(void);
 asmlinkage void nmi(void);
 asmlinkage void int3(void);
-asmlinkage void xen_debug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_stack_segment(void);
 asmlinkage void overflow(void);
 asmlinkage void bounds(void);
 asmlinkage void invalid_op(void);
@@ -38,22 +35,29 @@ asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
 asmlinkage void simd_coprocessor_error(void);
 
-#ifdef CONFIG_TRACING
-asmlinkage void trace_page_fault(void);
-#define trace_stack_segment stack_segment
-#define trace_divide_error divide_error
-#define trace_bounds bounds
-#define trace_invalid_op invalid_op
-#define trace_device_not_available device_not_available
-#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
-#define trace_invalid_TSS invalid_TSS
-#define trace_segment_not_present segment_not_present
-#define trace_general_protection general_protection
-#define trace_spurious_interrupt_bug spurious_interrupt_bug
-#define trace_coprocessor_error coprocessor_error
-#define trace_alignment_check alignment_check
-#define trace_simd_coprocessor_error simd_coprocessor_error
-#define trace_async_page_fault async_page_fault
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xendebug(void);
+asmlinkage void xen_xenint3(void);
+asmlinkage void xen_nmi(void);
+asmlinkage void xen_overflow(void);
+asmlinkage void xen_bounds(void);
+asmlinkage void xen_invalid_op(void);
+asmlinkage void xen_device_not_available(void);
+asmlinkage void xen_double_fault(void);
+asmlinkage void xen_coprocessor_segment_overrun(void);
+asmlinkage void xen_invalid_TSS(void);
+asmlinkage void xen_segment_not_present(void);
+asmlinkage void xen_stack_segment(void);
+asmlinkage void xen_general_protection(void);
+asmlinkage void xen_page_fault(void);
+asmlinkage void xen_spurious_interrupt_bug(void);
+asmlinkage void xen_coprocessor_error(void);
+asmlinkage void xen_alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void xen_machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void xen_simd_coprocessor_error(void);
 #endif
 
 dotraplinkage void do_divide_error(struct pt_regs *, long);
@@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
-#ifdef CONFIG_TRACING
-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
-#else
-static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
-{
-	do_page_fault(regs, error);
-}
-#endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
 dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 30269dafec47..184eb9894dae 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@
 
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current->thread.addr_limit)
-#define set_fs(x)	(current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+	current->thread.addr_limit = fs;
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+}
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e6676495b125..e9f793e2df7a 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,11 +12,14 @@ struct unwind_state {
 	struct task_struct *task;
 	int graph_idx;
 	bool error;
-#ifdef CONFIG_FRAME_POINTER
+#if defined(CONFIG_ORC_UNWINDER)
+	bool signal, full_regs;
+	unsigned long sp, bp, ip;
+	struct pt_regs *regs;
+#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
 	bool got_irq;
-	unsigned long *bp, *orig_sp;
+	unsigned long *bp, *orig_sp, ip;
 	struct pt_regs *regs;
-	unsigned long ip;
 #else
 	unsigned long *sp;
 #endif
@@ -24,41 +27,30 @@ struct unwind_state {
 
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
 		    struct pt_regs *regs, unsigned long *first_frame);
-
 bool unwind_next_frame(struct unwind_state *state);
-
 unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
 
 static inline bool unwind_done(struct unwind_state *state)
 {
 	return state->stack_info.type == STACK_TYPE_UNKNOWN;
 }
 
-static inline
-void unwind_start(struct unwind_state *state, struct task_struct *task,
-		  struct pt_regs *regs, unsigned long *first_frame)
-{
-	first_frame = first_frame ? : get_stack_pointer(task, regs);
-
-	__unwind_start(state, task, regs, first_frame);
-}
-
 static inline bool unwind_error(struct unwind_state *state)
 {
 	return state->error;
 }
 
-#ifdef CONFIG_FRAME_POINTER
-
 static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+		  struct pt_regs *regs, unsigned long *first_frame)
 {
-	if (unwind_done(state))
-		return NULL;
+	first_frame = first_frame ? : get_stack_pointer(task, regs);
 
-	return state->regs ? &state->regs->ip : state->bp + 1;
+	__unwind_start(state, task, regs, first_frame);
 }
 
+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 
 	return state->regs;
 }
-
-#else /* !CONFIG_FRAME_POINTER */
-
-static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	return NULL;
 }
+#endif
 
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+#ifdef CONFIG_ORC_UNWINDER
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+			void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+			void *orc, size_t orc_size) {}
+#endif
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x)			\
+({							\
+	unsigned long val;				\
+	if (task == current)				\
+		val = READ_ONCE(x);			\
+	else						\
+		val = READ_ONCE_NOCHECK(x);		\
+	val;						\
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
 {
-	return NULL;
+#ifdef CONFIG_SMP
+	return task != current && task->on_cpu;
+#else
+	return false;
+#endif
 }
 
-#endif /* CONFIG_FRAME_POINTER */
-
 #endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..bae46fc6b9de
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest.  The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack.  Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros.  Most asm code falls in this
+ * category.  In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer.  Such
+ * code needs to be annotated such that objtool can understand it.  The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction.  Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary.  It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+	.pushsection .discard.unwind_hints
+		/* struct unwind_hint */
+		.long .Lunwind_hint_ip_\@ - .
+		.short \sp_offset
+		.byte \sp_reg
+		.byte \type
+	.popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+	.if \base == %rsp
+		.if \indirect
+			.set sp_reg, ORC_REG_SP_INDIRECT
+		.else
+			.set sp_reg, ORC_REG_SP
+		.endif
+	.elseif \base == %rbp
+		.set sp_reg, ORC_REG_BP
+	.elseif \base == %rdi
+		.set sp_reg, ORC_REG_DI
+	.elseif \base == %rdx
+		.set sp_reg, ORC_REG_DX
+	.elseif \base == %r10
+		.set sp_reg, ORC_REG_R10
+	.else
+		.error "UNWIND_HINT_REGS: bad base register"
+	.endif
+
+	.set sp_offset, \offset
+
+	.if \iret
+		.set type, ORC_TYPE_REGS_IRET
+	.elseif \extra == 0
+		.set type, ORC_TYPE_REGS_IRET
+		.set sp_offset, \offset + (16*8)
+	.else
+		.set type, ORC_TYPE_REGS
+	.endif
+
+	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+	UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+	UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type)			\
+	"987: \n\t"						\
+	".pushsection .discard.unwind_hints\n\t"		\
+	/* struct unwind_hint */				\
+	".long 987b - .\n\t"					\
+	".short " __stringify(sp_offset) "\n\t"		\
+	".byte " __stringify(sp_reg) "\n\t"			\
+	".byte " __stringify(type) "\n\t"			\
+	".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..9f42beefc67a 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -7,12 +7,24 @@
 #ifndef _ASM_X86_VGA_H
 #define _ASM_X86_VGA_H
 
+#include <asm/set_memory.h>
+
 /*
  *	On the PC, we can just recalculate addresses and then
  *	access the videoram directly without any black magic.
+ *	To support memory encryption however, we need to access
+ *	the videoram as decrypted memory.
  */
 
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s)					\
+({								\
+	unsigned long start = (unsigned long)phys_to_virt(x);	\
+								\
+	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))			\
+		set_memory_decrypted(start, (s) >> PAGE_SHIFT);	\
+								\
+	start;							\
+})
 
 #define vga_readb(x) (*(x))
 #define vga_writeb(x, y) (*(y) = (x))
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 11071fcd630e..9606688caa4b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,6 +552,8 @@ static inline void
 MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 			struct desc_struct desc)
 {
+	u32 *p = (u32 *) &desc;
+
 	mcl->op = __HYPERVISOR_update_descriptor;
 	if (sizeof(maddr) == sizeof(long)) {
 		mcl->args[0] = maddr;
@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 	} else {
 		mcl->args[0] = maddr;
 		mcl->args[1] = maddr >> 32;
-		mcl->args[2] = desc.a;
-		mcl->args[3] = desc.b;
+		mcl->args[2] = *p++;
+		mcl->args[3] = *p;
 	}
 
 	trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
  *
  * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
  *	PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
  * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
  * 	which start at asm startup_xen() entry point and later jump to the C
  * 	xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a01892bdd61a..fd0a7895b63f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
 obj-$(CONFIG_COMPAT)	+= signal_compat.o
-obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y			+= traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o dumpstack.o nmi.o
 obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
+obj-$(CONFIG_EISA)		+= eisa.o
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
@@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
-ifdef CONFIG_FRAME_POINTER
-obj-y					+= unwind_frame.o
-else
-obj-y					+= unwind_guess.o
-endif
+obj-$(CONFIG_ORC_UNWINDER)		+= unwind_orc.o
+obj-$(CONFIG_FRAME_POINTER_UNWINDER)	+= unwind_frame.o
+obj-$(CONFIG_GUESS_UNWINDER)		+= unwind_guess.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6bb680671088..97bb2caf3428 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
 #define	ACPI_INVALID_GSI		INT_MIN
 
 /*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
  * with sanity checks for phys == 0 and size == 0.
  */
 char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 	if (!phys || !size)
 		return NULL;
 
-	return early_ioremap(phys, size);
+	return early_memremap(phys, size);
 }
 
 void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
 	if (!map || !size)
 		return;
 
-	early_iounmap(map, size);
+	early_memunmap(map, size);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -347,6 +347,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 	struct mpc_intsrc mp_irq;
 
 	/*
+	 * Check bus_irq boundary.
+	 */
+	if (bus_irq >= NR_IRQS_LEGACY) {
+		pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq);
+		return;
+	}
+
+	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
 	 */
 	ioapic = mp_find_ioapic(gsi);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32e14d137416..3344d3382e91 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
 
 int poke_int3_handler(struct pt_regs *regs)
 {
-	/* bp_patching_in_progress */
+	/*
+	 * Having observed our INT3 instruction, we now must observe
+	 * bp_patching_in_progress.
+	 *
+	 * 	in_progress = TRUE		INT3
+	 * 	WMB				RMB
+	 * 	write INT3			if (in_progress)
+	 *
+	 * Idem for bp_int3_handler.
+	 */
 	smp_rmb();
 
 	if (likely(!bp_patching_in_progress))
@@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 	bp_int3_addr = (u8 *)addr + sizeof(int3);
 	bp_patching_in_progress = true;
 	/*
-	 * Corresponding read barrier in int3 notifier for
-	 * making sure the in_progress flags is correctly ordered wrt.
-	 * patching
+	 * Corresponding read barrier in int3 notifier for making sure the
+	 * in_progress and handler are correctly ordered wrt. patching.
 	 */
 	smp_wmb();
 
@@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 	text_poke(addr, opcode, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
-
+	/*
+	 * sync_core() implies an smp_mb() and orders this store against
+	 * the writing of the new instruction.
+	 */
 	bp_patching_in_progress = false;
-	smp_wmb();
 
 	return addr;
 }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98b3dd8cf2bf..7834f73efbf1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -177,8 +177,6 @@ static int disable_apic_timer __initdata;
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
-int first_system_vector = FIRST_SYSTEM_VECTOR;
-
 /*
  * Debug level, exported for io_apic.c
  */
@@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = {
 
 static void apic_check_deadline_errata(void)
 {
-	const struct x86_cpu_id *m = x86_match_cpu(deadline_match);
+	const struct x86_cpu_id *m;
 	u32 rev;
 
+	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+		return;
+
+	m = x86_match_cpu(deadline_match);
 	if (!m)
 		return;
 
@@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void)
  */
 static void local_apic_timer_interrupt(void)
 {
-	int cpu = smp_processor_id();
-	struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+	struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
 
 	/*
 	 * Normally we should not be here till LAPIC has been initialized but
@@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void)
 	 * spurious.
 	 */
 	if (!evt->event_handler) {
-		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+			   smp_processor_id());
 		/* Switch it off */
 		lapic_timer_shutdown(evt);
 		return;
@@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
 	entering_ack_irq();
-	local_apic_timer_interrupt();
-	exiting_irq();
-
-	set_irq_regs(old_regs);
-}
-
-__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	/*
-	 * NOTE! We'd better ACK the irq immediately,
-	 * because timer handling can be slow.
-	 *
-	 * update_process_times() expects us to have done irq_enter().
-	 * Besides, if we don't timer interrupts ignore the global
-	 * interrupt lock, which is the WrongThing (tm) to do.
-	 */
-	entering_ack_irq();
 	trace_local_timer_entry(LOCAL_TIMER_VECTOR);
 	local_apic_timer_interrupt();
 	trace_local_timer_exit(LOCAL_TIMER_VECTOR);
@@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address)
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-static void __smp_spurious_interrupt(u8 vector)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
 {
+	u8 vector = ~regs->orig_ax;
 	u32 v;
 
+	entering_irq();
+	trace_spurious_apic_entry(vector);
+
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
 	 * if it is a vectored one.  Just in case...
@@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector)
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
 	pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
 		"should never happen.\n", vector, smp_processor_id());
-}
 
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
-{
-	entering_irq();
-	__smp_spurious_interrupt(~regs->orig_ax);
-	exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
-{
-	u8 vector = ~regs->orig_ax;
-
-	entering_irq();
-	trace_spurious_apic_entry(vector);
-	__smp_spurious_interrupt(vector);
 	trace_spurious_apic_exit(vector);
 	exiting_irq();
 }
@@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-static void __smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v;
-	u32 i = 0;
 	static const char * const error_interrupt_reason[] = {
 		"Send CS error",		/* APIC Error Bit 0 */
 		"Receive CS error",		/* APIC Error Bit 1 */
@@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 		"Received illegal vector",	/* APIC Error Bit 6 */
 		"Illegal register address",	/* APIC Error Bit 7 */
 	};
+	u32 v, i = 0;
+
+	entering_irq();
+	trace_error_apic_entry(ERROR_APIC_VECTOR);
 
 	/* First tickle the hardware, only then report what went on. -- REW */
 	if (lapic_get_maxlvt() > 3)	/* Due to the Pentium erratum 3AP. */
@@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
 
-}
-
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
-{
-	entering_irq();
-	__smp_error_interrupt(regs);
-	exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
-{
-	entering_irq();
-	trace_error_apic_entry(ERROR_APIC_VECTOR);
-	__smp_error_interrupt(regs);
 	trace_error_apic_exit(ERROR_APIC_VECTOR);
 	exiting_irq();
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b4f5f73febdb..70e48aa6af98 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 			 entry.vector, entry.irr, entry.delivery_status);
 		if (ir_entry->format)
 			printk(KERN_DEBUG "%s, remapped, I(%04X),  Z(%X)\n",
-			       buf, (ir_entry->index << 15) | ir_entry->index,
+			       buf, (ir_entry->index2 << 15) | ir_entry->index,
 			       ir_entry->zero);
 		else
 			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
@@ -2093,7 +2093,7 @@ static inline void __init check_timer(void)
 			int idx;
 			idx = find_irq_entry(apic1, pin1, mp_INT);
 			if (idx != -1 && irq_trigger(idx))
-				unmask_ioapic_irq(irq_get_chip_data(0));
+				unmask_ioapic_irq(irq_get_irq_data(0));
 		}
 		irq_domain_deactivate_irq(irq_data);
 		irq_domain_activate_irq(irq_data);
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b3af457ed667..88c214e75a6b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
 		offset = current_offset;
 next:
 		vector += 16;
-		if (vector >= first_system_vector) {
+		if (vector >= FIRST_SYSTEM_VECTOR) {
 			offset = (offset + 1) % 16;
 			vector = FIRST_EXTERNAL_VECTOR + offset;
 		}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
 
 #include <asm/ucontext.h>
 
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
 	OFFSET(stack_canary_offset, stack_canary, canary);
 #endif
 
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
-	BLANK();
-	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
-	BLANK();
-	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
 	BLANK();
 	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 	DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 99332f550c48..cf42206926af 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,7 +20,6 @@ static char syscalls_ia32[] = {
 int main(void)
 {
 #ifdef CONFIG_PARAVIRT
-	OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
 	BLANK();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cdf82492b770..e17942c131c8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bb5abe8f5fd4..9862e2cd6d93 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -134,6 +134,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
+		OPTIMIZER_HIDE_VAR(f_vide);
 		d = rdtsc();
 		while (n--)
 			f_vide();
@@ -296,13 +297,29 @@ static int nearby_node(int apicid)
 }
 #endif
 
+#ifdef CONFIG_SMP
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+	u32 cus_per_node;
+
+	if (c->x86 >= 0x17)
+		return;
+
+	cus_per_node = c->x86_max_cores / nodes_per_socket;
+	c->cpu_core_id %= cus_per_node;
+}
+
 /*
  * Fixup core topology information for
  * (1) AMD multi-node processors
  *     Assumption: Number of cores in each internal node is the same.
  * (2) AMD processors supporting compute units
  */
-#ifdef CONFIG_SMP
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
 	u8 node_id;
@@ -353,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 	} else
 		return;
 
-	/* fixup multi-node processor information */
 	if (nodes_per_socket > 1) {
-		u32 cus_per_node;
-
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-		cus_per_node = c->x86_max_cores / nodes_per_socket;
-
-		/* core id has to be in the [0 .. cores_per_node - 1] range */
-		c->cpu_core_id %= cus_per_node;
+		legacy_fixup_core_id(c);
 	}
 }
 #endif
@@ -547,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
+	u32 dummy;
+
 	early_init_amd_mc(c);
 
+	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
 	/*
 	 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
 	 * with P/T states and does not stop in deep C-states
@@ -611,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	 */
 	if (cpu_has_amd_erratum(c, amd_erratum_400))
 		set_cpu_bug(c, X86_BUG_AMD_E400);
+
+	/*
+	 * BIOS support is required for SME. If BIOS has enabled SME then
+	 * adjust x86_phys_bits by the SME physical address space reduction
+	 * value. If BIOS has not enabled SME then don't advertise the
+	 * feature (set in scattered.c). Also, since the SME support requires
+	 * long mode, don't advertise the feature under CONFIG_X86_32.
+	 */
+	if (cpu_has(c, X86_FEATURE_SME)) {
+		u64 msr;
+
+		/* Check if SME is enabled */
+		rdmsrl(MSR_K8_SYSCFG, msr);
+		if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+			c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+			if (IS_ENABLED(CONFIG_X86_32))
+				clear_cpu_cap(c, X86_FEATURE_SME);
+		} else {
+			clear_cpu_cap(c, X86_FEATURE_SME);
+		}
+	}
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -729,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd(struct cpuinfo_x86 *c)
 {
-	u32 dummy;
-
 	early_init_amd(c);
 
 	/*
@@ -792,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
-	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
 	/* 3DNow or LM implies PREFETCHW */
 	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
 		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index d869c8671e36..0ee83321a313 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -8,20 +8,25 @@
  * This file is licensed under GPLv2.
  */
 
-#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
 #include <linux/math64.h>
 #include <linux/percpu.h>
 #include <linux/smp.h>
 
 struct aperfmperf_sample {
 	unsigned int	khz;
-	unsigned long	jiffies;
+	ktime_t	time;
 	u64	aperf;
 	u64	mperf;
 };
 
 static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
 
+#define APERFMPERF_CACHE_THRESHOLD_MS	10
+#define APERFMPERF_REFRESH_DELAY_MS	20
+#define APERFMPERF_STALE_THRESHOLD_MS	1000
+
 /*
  * aperfmperf_snapshot_khz()
  * On the current CPU, snapshot APERF, MPERF, and jiffies
@@ -33,13 +38,18 @@ static void aperfmperf_snapshot_khz(void *dummy)
 	u64 aperf, aperf_delta;
 	u64 mperf, mperf_delta;
 	struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+	ktime_t now = ktime_get();
+	s64 time_delta = ktime_ms_delta(now, s->time);
+	unsigned long flags;
 
-	/* Don't bother re-computing within 10 ms */
-	if (time_before(jiffies, s->jiffies + HZ/100))
+	/* Don't bother re-computing within the cache threshold time. */
+	if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
 		return;
 
+	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
+	local_irq_restore(flags);
 
 	aperf_delta = aperf - s->aperf;
 	mperf_delta = mperf - s->mperf;
@@ -51,22 +61,21 @@ static void aperfmperf_snapshot_khz(void *dummy)
 	if (mperf_delta == 0)
 		return;
 
-	/*
-	 * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
-	 *	khz = (cpu_khz * aperf_delta) / mperf_delta
-	 */
-	if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
-		s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
-	else	/* khz = aperf_delta / (mperf_delta / cpu_khz) */
-		s->khz = div64_u64(aperf_delta,
-			div64_u64(mperf_delta, cpu_khz));
-	s->jiffies = jiffies;
+	s->time = now;
 	s->aperf = aperf;
 	s->mperf = mperf;
+
+	/* If the previous iteration was too long ago, discard it. */
+	if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
+		s->khz = 0;
+	else
+		s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
 }
 
 unsigned int arch_freq_get_on_cpu(int cpu)
 {
+	unsigned int khz;
+
 	if (!cpu_khz)
 		return 0;
 
@@ -74,6 +83,12 @@ unsigned int arch_freq_get_on_cpu(int cpu)
 		return 0;
 
 	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+	khz = per_cpu(samples.khz, cpu);
+	if (khz)
+		return khz;
+
+	msleep(APERFMPERF_REFRESH_DELAY_MS);
+	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
 
 	return per_cpu(samples.khz, cpu);
 }
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0af86d9242da..db684880d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,6 +21,14 @@
 
 void __init check_bugs(void)
 {
+#ifdef CONFIG_X86_32
+	/*
+	 * Regardless of whether PCID is enumerated, the SDM says
+	 * that it can't be enabled in 32-bit mode.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
 	identify_boot_cpu();
 
 	if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..efba8e3da3e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+	/* require an exact match without trailing characters */
+	if (strlen(s))
+		return 0;
+
+	/* do not emit a message if the feature is not present */
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return 1;
+
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+	pr_info("nopcid: PCID feature disabled\n");
+	return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
 static int __init x86_noinvpcid_setup(char *s)
 {
 	/* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_PCID)) {
+		if (cpu_has(c, X86_FEATURE_PGE)) {
+			cr4_set_bits(X86_CR4_PCIDE);
+		} else {
+			/*
+			 * flush_tlb_all(), as currently implemented, won't
+			 * work if PCID is on but PGE is not.  Since that
+			 * combination doesn't exist on real hardware, there's
+			 * no reason to try to fully support it, but it's
+			 * polite to avoid corrupting data if we're on
+			 * an improperly configured VM.
+			 */
+			clear_cpu_cap(c, X86_FEATURE_PCID);
+		}
+	}
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	setup_smep(c);
 	setup_smap(c);
 
+	/* Set up PCID */
+	setup_pcid(c);
+
 	/*
 	 * The vendor-specific functions might have changed features.
 	 * Now we do "generic changes."
@@ -1289,15 +1329,6 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct desc_ptr idt_descr __ro_after_init = {
-	.size = NR_VECTORS * 16 - 1,
-	.address = (unsigned long) idt_table,
-};
-const struct desc_ptr debug_idt_descr = {
-	.size = NR_VECTORS * 16 - 1,
-	.address = (unsigned long) debug_idt_table,
-};
-
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c55fb2cb2acc..24f749324c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 	struct cacheinfo *this_leaf;
 	int i, sibling;
 
-	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+	/*
+	 * For L3, always use the pre-calculated cpu_llc_shared_mask
+	 * to derive shared_cpu_map.
+	 */
+	if (index == 3) {
+		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+			this_cpu_ci = get_cpu_cacheinfo(i);
+			if (!this_cpu_ci->info_list)
+				continue;
+			this_leaf = this_cpu_ci->info_list + index;
+			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+				if (!cpu_online(sibling))
+					continue;
+				cpumask_set_cpu(sibling,
+						&this_leaf->shared_cpu_map);
+			}
+		}
+	} else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		unsigned int apicid, nshared, first, last;
 
 		this_leaf = this_cpu_ci->info_list + index;
@@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 						&this_leaf->shared_cpu_map);
 			}
 		}
-	} else if (index == 3) {
-		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
-			this_cpu_ci = get_cpu_cacheinfo(i);
-			if (!this_cpu_ci->info_list)
-				continue;
-			this_leaf = this_cpu_ci->info_list + index;
-			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
-				if (!cpu_online(sibling))
-					continue;
-				cpumask_set_cpu(sibling,
-						&this_leaf->shared_cpu_map);
-			}
-		}
 	} else
 		return 0;
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5b366462f579..cd5fc61ba450 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -30,7 +30,8 @@
 #include <linux/cpuhotplug.h>
 
 #include <asm/intel-family.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
 
 #define MAX_MBA_BW	100u
 #define MBA_IS_LINEAR	0x4
@@ -38,7 +39,13 @@
 /* Mutex to protect rdtgroup access. */
 DEFINE_MUTEX(rdtgroup_mutex);
 
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 
 /*
  * Used to store the max resource name width and max resource data width
@@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
  */
 int max_name_width, max_data_width;
 
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
 static void
 mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
 static void
@@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
 #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
 
 struct rdt_resource rdt_resources_all[] = {
+	[RDT_RESOURCE_L3] =
 	{
+		.rid			= RDT_RESOURCE_L3,
 		.name			= "L3",
 		.domains		= domain_init(RDT_RESOURCE_L3),
 		.msr_base		= IA32_L3_CBM_BASE,
@@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = {
 		},
 		.parse_ctrlval		= parse_cbm,
 		.format_str		= "%d=%0*x",
+		.fflags			= RFTYPE_RES_CACHE,
 	},
+	[RDT_RESOURCE_L3DATA] =
 	{
+		.rid			= RDT_RESOURCE_L3DATA,
 		.name			= "L3DATA",
 		.domains		= domain_init(RDT_RESOURCE_L3DATA),
 		.msr_base		= IA32_L3_CBM_BASE,
@@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = {
 		},
 		.parse_ctrlval		= parse_cbm,
 		.format_str		= "%d=%0*x",
+		.fflags			= RFTYPE_RES_CACHE,
 	},
+	[RDT_RESOURCE_L3CODE] =
 	{
+		.rid			= RDT_RESOURCE_L3CODE,
 		.name			= "L3CODE",
 		.domains		= domain_init(RDT_RESOURCE_L3CODE),
 		.msr_base		= IA32_L3_CBM_BASE,
@@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = {
 		},
 		.parse_ctrlval		= parse_cbm,
 		.format_str		= "%d=%0*x",
+		.fflags			= RFTYPE_RES_CACHE,
 	},
+	[RDT_RESOURCE_L2] =
 	{
+		.rid			= RDT_RESOURCE_L2,
 		.name			= "L2",
 		.domains		= domain_init(RDT_RESOURCE_L2),
 		.msr_base		= IA32_L2_CBM_BASE,
@@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = {
 		},
 		.parse_ctrlval		= parse_cbm,
 		.format_str		= "%d=%0*x",
+		.fflags			= RFTYPE_RES_CACHE,
 	},
+	[RDT_RESOURCE_MBA] =
 	{
+		.rid			= RDT_RESOURCE_MBA,
 		.name			= "MB",
 		.domains		= domain_init(RDT_RESOURCE_MBA),
 		.msr_base		= IA32_MBA_THRTL_BASE,
@@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = {
 		.cache_level		= 3,
 		.parse_ctrlval		= parse_bw,
 		.format_str		= "%d=%*d",
+		.fflags			= RFTYPE_RES_MB,
 	},
 };
 
@@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
  * is always 20 on hsw server parts. The minimum cache bitmask length
  * allowed for HSW server is always 2 bits. Hardcode all of them.
  */
-static inline bool cache_alloc_hsw_probe(void)
+static inline void cache_alloc_hsw_probe(void)
 {
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-	    boot_cpu_data.x86 == 6 &&
-	    boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
-		struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
-		u32 l, h, max_cbm = BIT_MASK(20) - 1;
-
-		if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
-			return false;
-		rdmsr(IA32_L3_CBM_BASE, l, h);
+	struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
+	u32 l, h, max_cbm = BIT_MASK(20) - 1;
 
-		/* If all the bits were set in MSR, return success */
-		if (l != max_cbm)
-			return false;
+	if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+		return;
+	rdmsr(IA32_L3_CBM_BASE, l, h);
 
-		r->num_closid = 4;
-		r->default_ctrl = max_cbm;
-		r->cache.cbm_len = 20;
-		r->cache.min_cbm_bits = 2;
-		r->capable = true;
-		r->enabled = true;
+	/* If all the bits were set in MSR, return success */
+	if (l != max_cbm)
+		return;
 
-		return true;
-	}
+	r->num_closid = 4;
+	r->default_ctrl = max_cbm;
+	r->cache.cbm_len = 20;
+	r->cache.shareable_bits = 0xc0000;
+	r->cache.min_cbm_bits = 2;
+	r->alloc_capable = true;
+	r->alloc_enabled = true;
 
-	return false;
+	rdt_alloc_capable = true;
 }
 
 /*
@@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
 			return false;
 	}
 	r->data_width = 3;
-	rdt_get_mba_infofile(r);
 
-	r->capable = true;
-	r->enabled = true;
+	r->alloc_capable = true;
+	r->alloc_enabled = true;
 
 	return true;
 }
 
-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
 {
 	union cpuid_0x10_1_eax eax;
 	union cpuid_0x10_x_edx edx;
@@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r)
 	r->num_closid = edx.split.cos_max + 1;
 	r->cache.cbm_len = eax.split.cbm_len + 1;
 	r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+	r->cache.shareable_bits = ebx & r->default_ctrl;
 	r->data_width = (r->cache.cbm_len + 3) / 4;
-	rdt_get_cache_infofile(r);
-	r->capable = true;
-	r->enabled = true;
+	r->alloc_capable = true;
+	r->alloc_enabled = true;
 }
 
 static void rdt_get_cdp_l3_config(int type)
@@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type)
 	r->cache.cbm_len = r_l3->cache.cbm_len;
 	r->default_ctrl = r_l3->default_ctrl;
 	r->data_width = (r->cache.cbm_len + 3) / 4;
-	r->capable = true;
+	r->alloc_capable = true;
 	/*
 	 * By default, CDP is disabled. CDP can be enabled by mount parameter
 	 * "cdp" during resctrl file system mount time.
 	 */
-	r->enabled = false;
+	r->alloc_enabled = false;
 }
 
 static int get_cache_id(int cpu, int level)
@@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
 		wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
 }
 
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+	struct rdt_domain *d;
+
+	list_for_each_entry(d, &r->domains, list) {
+		/* Find the domain that contains this CPU */
+		if (cpumask_test_cpu(cpu, &d->cpu_mask))
+			return d;
+	}
+
+	return NULL;
+}
+
 void rdt_ctrl_update(void *arg)
 {
 	struct msr_param *m = arg;
@@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg)
 	int cpu = smp_processor_id();
 	struct rdt_domain *d;
 
-	list_for_each_entry(d, &r->domains, list) {
-		/* Find the domain that contains this CPU */
-		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
-			r->msr_update(d, m, r);
-			return;
-		}
+	d = get_domain_from_cpu(cpu, r);
+	if (d) {
+		r->msr_update(d, m, r);
+		return;
 	}
 	pr_warn_once("cpu %d not found in any domain for resource %s\n",
 		     cpu, r->name);
@@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg)
  * caller, return the first domain whose id is bigger than the input id.
  * The domain list is sorted by id in ascending order.
  */
-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
-					  struct list_head **pos)
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+				   struct list_head **pos)
 {
 	struct rdt_domain *d;
 	struct list_head *l;
@@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 	return 0;
 }
 
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+	size_t tsize;
+
+	if (is_llc_occupancy_enabled()) {
+		d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+					   sizeof(unsigned long),
+					   GFP_KERNEL);
+		if (!d->rmid_busy_llc)
+			return -ENOMEM;
+		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+	}
+	if (is_mbm_total_enabled()) {
+		tsize = sizeof(*d->mbm_total);
+		d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+		if (!d->mbm_total) {
+			kfree(d->rmid_busy_llc);
+			return -ENOMEM;
+		}
+	}
+	if (is_mbm_local_enabled()) {
+		tsize = sizeof(*d->mbm_local);
+		d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+		if (!d->mbm_local) {
+			kfree(d->rmid_busy_llc);
+			kfree(d->mbm_total);
+			return -ENOMEM;
+		}
+	}
+
+	if (is_mbm_enabled()) {
+		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+	}
+
+	return 0;
+}
+
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		return;
 
 	d->id = id;
+	cpumask_set_cpu(cpu, &d->cpu_mask);
 
-	if (domain_setup_ctrlval(r, d)) {
+	if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
+		kfree(d);
+		return;
+	}
+
+	if (r->mon_capable && domain_setup_mon_state(r, d)) {
 		kfree(d);
 		return;
 	}
 
-	cpumask_set_cpu(cpu, &d->cpu_mask);
 	list_add_tail(&d->list, add_pos);
+
+	/*
+	 * If resctrl is mounted, add
+	 * per domain monitor data directories.
+	 */
+	if (static_branch_unlikely(&rdt_mon_enable_key))
+		mkdir_mondata_subdir_allrdtgrp(r, d);
 }
 
 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 
 	cpumask_clear_cpu(cpu, &d->cpu_mask);
 	if (cpumask_empty(&d->cpu_mask)) {
+		/*
+		 * If resctrl is mounted, remove all the
+		 * per domain monitor data directories.
+		 */
+		if (static_branch_unlikely(&rdt_mon_enable_key))
+			rmdir_mondata_subdir_allrdtgrp(r, d->id);
 		kfree(d->ctrl_val);
+		kfree(d->rmid_busy_llc);
+		kfree(d->mbm_total);
+		kfree(d->mbm_local);
 		list_del(&d->list);
+		if (is_mbm_enabled())
+			cancel_delayed_work(&d->mbm_over);
+		if (is_llc_occupancy_enabled() &&  has_busy_rmid(r, d)) {
+			/*
+			 * When a package is going down, forcefully
+			 * decrement rmid->ebusy. There is no way to know
+			 * that the L3 was flushed and hence may lead to
+			 * incorrect counts in rare scenarios, but leaving
+			 * the RMID as busy creates RMID leaks if the
+			 * package never comes back.
+			 */
+			__check_limbo(d, true);
+			cancel_delayed_work(&d->cqm_limbo);
+		}
+
 		kfree(d);
+		return;
+	}
+
+	if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+			cancel_delayed_work(&d->mbm_over);
+			mbm_setup_overflow_handler(d, 0);
+		}
+		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+		    has_busy_rmid(r, d)) {
+			cancel_delayed_work(&d->cqm_limbo);
+			cqm_setup_limbo_handler(d, 0);
+		}
 	}
 }
 
-static void clear_closid(int cpu)
+static void clear_closid_rmid(int cpu)
 {
 	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
 
-	per_cpu(cpu_closid, cpu) = 0;
-	state->closid = 0;
-	wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+	state->default_closid = 0;
+	state->default_rmid = 0;
+	state->cur_closid = 0;
+	state->cur_rmid = 0;
+	wrmsr(IA32_PQR_ASSOC, 0, 0);
 }
 
 static int intel_rdt_online_cpu(unsigned int cpu)
@@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu)
 		domain_add_cpu(cpu, r);
 	/* The cpu is set in default rdtgroup after online. */
 	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
-	clear_closid(cpu);
+	clear_closid_rmid(cpu);
 	mutex_unlock(&rdtgroup_mutex);
 
 	return 0;
 }
 
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+	struct rdtgroup *cr;
+
+	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
+			break;
+		}
+	}
+}
+
 static int intel_rdt_offline_cpu(unsigned int cpu)
 {
 	struct rdtgroup *rdtgrp;
@@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
 	for_each_capable_rdt_resource(r)
 		domain_remove_cpu(cpu, r);
 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
-		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+			clear_childcpus(rdtgrp, cpu);
 			break;
+		}
 	}
-	clear_closid(cpu);
+	clear_closid_rmid(cpu);
 	mutex_unlock(&rdtgroup_mutex);
 
 	return 0;
@@ -492,7 +627,7 @@ static __init void rdt_init_padding(void)
 	struct rdt_resource *r;
 	int cl;
 
-	for_each_capable_rdt_resource(r) {
+	for_each_alloc_capable_rdt_resource(r) {
 		cl = strlen(r->name);
 		if (cl > max_name_width)
 			max_name_width = cl;
@@ -502,38 +637,153 @@ static __init void rdt_init_padding(void)
 	}
 }
 
-static __init bool get_rdt_resources(void)
+enum {
+	RDT_FLAG_CMT,
+	RDT_FLAG_MBM_TOTAL,
+	RDT_FLAG_MBM_LOCAL,
+	RDT_FLAG_L3_CAT,
+	RDT_FLAG_L3_CDP,
+	RDT_FLAG_L2_CAT,
+	RDT_FLAG_MBA,
+};
+
+#define RDT_OPT(idx, n, f)	\
+[idx] = {			\
+	.name = n,		\
+	.flag = f		\
+}
+
+struct rdt_options {
+	char	*name;
+	int	flag;
+	bool	force_off, force_on;
+};
+
+static struct rdt_options rdt_options[]  __initdata = {
+	RDT_OPT(RDT_FLAG_CMT,	    "cmt",	X86_FEATURE_CQM_OCCUP_LLC),
+	RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+	RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),
+	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),
+	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2),
+	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+	struct rdt_options *o;
+	bool force_off;
+	char *tok;
+
+	if (*str == '=')
+		str++;
+	while ((tok = strsep(&str, ",")) != NULL) {
+		force_off = *tok == '!';
+		if (force_off)
+			tok++;
+		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+			if (strcmp(tok, o->name) == 0) {
+				if (force_off)
+					o->force_off = true;
+				else
+					o->force_on = true;
+				break;
+			}
+		}
+	}
+	return 1;
+}
+__setup("rdt", set_rdt_options);
+
+static bool __init rdt_cpu_has(int flag)
+{
+	bool ret = boot_cpu_has(flag);
+	struct rdt_options *o;
+
+	if (!ret)
+		return ret;
+
+	for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+		if (flag == o->flag) {
+			if (o->force_off)
+				ret = false;
+			if (o->force_on)
+				ret = true;
+			break;
+		}
+	}
+	return ret;
+}
+
+static __init bool get_rdt_alloc_resources(void)
 {
 	bool ret = false;
 
-	if (cache_alloc_hsw_probe())
+	if (rdt_alloc_capable)
 		return true;
 
 	if (!boot_cpu_has(X86_FEATURE_RDT_A))
 		return false;
 
-	if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
-		rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
-		if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+		rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+		if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
 			rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
 			rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
 		}
 		ret = true;
 	}
-	if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
 		/* CPUID 0x10.2 fields are same format at 0x10.1 */
-		rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+		rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
 		ret = true;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_MBA)) {
+	if (rdt_cpu_has(X86_FEATURE_MBA)) {
 		if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
 			ret = true;
 	}
-
 	return ret;
 }
 
+static __init bool get_rdt_mon_resources(void)
+{
+	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+		rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+		rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+		rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+	if (!rdt_mon_features)
+		return false;
+
+	return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+}
+
+static __init void rdt_quirks(void)
+{
+	switch (boot_cpu_data.x86_model) {
+	case INTEL_FAM6_HASWELL_X:
+		if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+			cache_alloc_hsw_probe();
+		break;
+	case INTEL_FAM6_SKYLAKE_X:
+		if (boot_cpu_data.x86_mask <= 4)
+			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+	}
+}
+
+static __init bool get_rdt_resources(void)
+{
+	rdt_quirks();
+	rdt_alloc_capable = get_rdt_alloc_resources();
+	rdt_mon_capable = get_rdt_mon_resources();
+
+	return (rdt_mon_capable || rdt_alloc_capable);
+}
+
 static int __init intel_rdt_late_init(void)
 {
 	struct rdt_resource *r;
@@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void)
 		return ret;
 	}
 
-	for_each_capable_rdt_resource(r)
+	for_each_alloc_capable_rdt_resource(r)
 		pr_info("Intel RDT %s allocation detected\n", r->name);
 
+	for_each_mon_capable_rdt_resource(r)
+		pr_info("Intel RDT %s monitoring detected\n", r->name);
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 000000000000..ebaddaeef023
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,440 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#include <linux/sched.h>
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#define IA32_L3_QOS_CFG		0xc81
+#define IA32_L3_CBM_BASE	0xc90
+#define IA32_L2_CBM_BASE	0xd10
+#define IA32_MBA_THRTL_BASE	0xd50
+
+#define L3_QOS_CDP_ENABLE	0x01ULL
+
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID		0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID	0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID	0x03
+
+#define CQM_LIMBOCHECK_INTERVAL	1000
+
+#define MBM_CNTR_WIDTH			24
+#define MBM_OVERFLOW_INTERVAL		1000
+
+#define RMID_VAL_ERROR			BIT_ULL(63)
+#define RMID_VAL_UNAVAIL		BIT_ULL(62)
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid:		event id
+ * @name:		name of the event
+ */
+struct mon_evt {
+	u32			evtid;
+	char			*name;
+	struct list_head	list;
+};
+
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid:               Resource id associated with the event file.
+ * @evtid:             Event id associated with the event file
+ * @domid:             The domain to which the event file belongs
+ */
+union mon_data_bits {
+	void *priv;
+	struct {
+		unsigned int rid	: 10;
+		unsigned int evtid	: 8;
+		unsigned int domid	: 14;
+	} u;
+};
+
+struct rmid_read {
+	struct rdtgroup		*rgrp;
+	struct rdt_domain	*d;
+	int			evtid;
+	bool			first;
+	u64			val;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
+
+enum rdt_group_type {
+	RDTCTRL_GROUP = 0,
+	RDTMON_GROUP,
+	RDT_NUM_GROUP,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn		kernlfs node for the mon_data directory
+ * @parent:			parent rdtgrp
+ * @crdtgrp_list:		child rdtgroup node list
+ * @rmid:			rmid for this rdtgroup
+ */
+struct mongroup {
+	struct kernfs_node	*mon_data_kn;
+	struct rdtgroup		*parent;
+	struct list_head	crdtgrp_list;
+	u32			rmid;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn:				kernfs node
+ * @rdtgroup_list:		linked list for all rdtgroups
+ * @closid:			closid for this rdtgroup
+ * @cpu_mask:			CPUs assigned to this rdtgroup
+ * @flags:			status bits
+ * @waitcount:			how many cpus expect to find this
+ *				group when they acquire rdtgroup_mutex
+ * @type:			indicates type of this rdtgroup - either
+ *				monitor only or ctrl_mon group
+ * @mon:			mongroup related data
+ */
+struct rdtgroup {
+	struct kernfs_node	*kn;
+	struct list_head	rdtgroup_list;
+	u32			closid;
+	struct cpumask		cpu_mask;
+	int			flags;
+	atomic_t		waitcount;
+	enum rdt_group_type	type;
+	struct mongroup		mon;
+};
+
+/* rdtgroup.flags */
+#define	RDT_DELETED		1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST	1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO			BIT(0)
+#define RFTYPE_BASE			BIT(1)
+#define RF_CTRLSHIFT			4
+#define RF_MONSHIFT			5
+#define RFTYPE_CTRL			BIT(RF_CTRLSHIFT)
+#define RFTYPE_MON			BIT(RF_MONSHIFT)
+#define RFTYPE_RES_CACHE		BIT(8)
+#define RFTYPE_RES_MB			BIT(9)
+#define RF_CTRL_INFO			(RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO			(RFTYPE_INFO | RFTYPE_MON)
+#define RF_CTRL_BASE			(RFTYPE_BASE | RFTYPE_CTRL)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width, max_data_width;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name:	File name
+ * @mode:	Access mode
+ * @kf_ops:	File operations
+ * @flags:	File specific RFTYPE_FLAGS_* flags
+ * @fflags:	File specific RF_* or RFTYPE_* flags
+ * @seq_show:	Show content of the file
+ * @write:	Write to the file
+ */
+struct rftype {
+	char			*name;
+	umode_t			mode;
+	struct kernfs_ops	*kf_ops;
+	unsigned long		flags;
+	unsigned long		fflags;
+
+	int (*seq_show)(struct kernfs_open_file *of,
+			struct seq_file *sf, void *v);
+	/*
+	 * write() is the generic write callback which maps directly to
+	 * kernfs write operation and overrides all other operations.
+	 * Maximum write size is determined by ->max_write_len.
+	 */
+	ssize_t (*write)(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr	Value of IA32_QM_CTR for this RMID last time we read it
+ */
+struct mbm_state {
+	u64	chunks;
+	u64	prev_msr;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list:	all instances of this resource
+ * @id:		unique id for this instance
+ * @cpu_mask:	which cpus share this resource
+ * @rmid_busy_llc:
+ *		bitmap of which limbo RMIDs are above threshold
+ * @mbm_total:	saved state for MBM total bandwidth
+ * @mbm_local:	saved state for MBM local bandwidth
+ * @mbm_over:	worker to periodically read MBM h/w counters
+ * @cqm_limbo:	worker to periodically read CQM h/w counters
+ * @mbm_work_cpu:
+ *		worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ *		worker cpu for CQM h/w counters
+ * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
+ * @new_ctrl:	new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
+ */
+struct rdt_domain {
+	struct list_head	list;
+	int			id;
+	struct cpumask		cpu_mask;
+	unsigned long		*rmid_busy_llc;
+	struct mbm_state	*mbm_total;
+	struct mbm_state	*mbm_local;
+	struct delayed_work	mbm_over;
+	struct delayed_work	cqm_limbo;
+	int			mbm_work_cpu;
+	int			cqm_work_cpu;
+	u32			*ctrl_val;
+	u32			new_ctrl;
+	bool			have_new_ctrl;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res:       The resource to use
+ * @low:       Beginning index from base MSR
+ * @high:      End index
+ */
+struct msr_param {
+	struct rdt_resource	*res;
+	int			low;
+	int			high;
+};
+
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len:		Length of the cache bit mask
+ * @min_cbm_bits:	Minimum number of consecutive bits to be set
+ * @cbm_idx_mult:	Multiplier of CBM index
+ * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
+ *			closid * cbm_idx_multi + cbm_idx_offset
+ *			in a cache bit mask
+ * @shareable_bits:	Bitmask of shareable resource with other
+ *			executing entities
+ */
+struct rdt_cache {
+	unsigned int	cbm_len;
+	unsigned int	min_cbm_bits;
+	unsigned int	cbm_idx_mult;
+	unsigned int	cbm_idx_offset;
+	unsigned int	shareable_bits;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay:		Max throttle delay. Delay is the hardware
+ *			representation for memory bandwidth.
+ * @min_bw:		Minimum memory bandwidth percentage user can request
+ * @bw_gran:		Granularity at which the memory bandwidth is allocated
+ * @delay_linear:	True if memory B/W delay is in linear scale
+ * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+	u32		max_delay;
+	u32		min_bw;
+	u32		bw_gran;
+	u32		delay_linear;
+	u32		*mb_map;
+};
+
+static inline bool is_llc_occupancy_enabled(void)
+{
+	return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+static inline bool is_mbm_total_enabled(void)
+{
+	return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+static inline bool is_mbm_local_enabled(void)
+{
+	return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
+static inline bool is_mbm_enabled(void)
+{
+	return (is_mbm_total_enabled() || is_mbm_local_enabled());
+}
+
+static inline bool is_mbm_event(int e)
+{
+	return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+		e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @rid:		The index of the resource
+ * @alloc_enabled:	Is allocation enabled on this machine
+ * @mon_enabled:		Is monitoring enabled for this feature
+ * @alloc_capable:	Is allocation available on this machine
+ * @mon_capable:		Is monitor feature available on this machine
+ * @name:		Name to use in "schemata" file
+ * @num_closid:		Number of CLOSIDs available
+ * @cache_level:	Which cache level defines scope of this resource
+ * @default_ctrl:	Specifies default cache cbm or memory B/W percent.
+ * @msr_base:		Base MSR address for CBMs
+ * @msr_update:		Function pointer to update QOS MSRs
+ * @data_width:		Character width of data when displaying
+ * @domains:		All domains for this resource
+ * @cache:		Cache allocation related data
+ * @format_str:		Per resource format string to show domain value
+ * @parse_ctrlval:	Per resource function pointer to parse control values
+ * @evt_list:			List of monitoring events
+ * @num_rmid:			Number of RMIDs available
+ * @mon_scale:			cqm counter * mon_scale = occupancy in bytes
+ * @fflags:			flags to choose base and info files
+ */
+struct rdt_resource {
+	int			rid;
+	bool			alloc_enabled;
+	bool			mon_enabled;
+	bool			alloc_capable;
+	bool			mon_capable;
+	char			*name;
+	int			num_closid;
+	int			cache_level;
+	u32			default_ctrl;
+	unsigned int		msr_base;
+	void (*msr_update)	(struct rdt_domain *d, struct msr_param *m,
+				 struct rdt_resource *r);
+	int			data_width;
+	struct list_head	domains;
+	struct rdt_cache	cache;
+	struct rdt_membw	membw;
+	const char		*format_str;
+	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
+				 struct rdt_domain *d);
+	struct list_head	evt_list;
+	int			num_rmid;
+	unsigned int		mon_scale;
+	unsigned long		fflags;
+};
+
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+int __init rdtgroup_init(void);
+
+enum {
+	RDT_RESOURCE_L3,
+	RDT_RESOURCE_L3DATA,
+	RDT_RESOURCE_L3CODE,
+	RDT_RESOURCE_L2,
+	RDT_RESOURCE_MBA,
+
+	/* Must be the last */
+	RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->alloc_capable || r->mon_capable)
+
+#define for_each_alloc_capable_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->alloc_capable)
+
+#define for_each_mon_capable_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->mon_capable)
+
+#define for_each_alloc_enabled_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->alloc_enabled)
+
+#define for_each_mon_enabled_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->mon_enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+	struct {
+		unsigned int cbm_len:5;
+	} split;
+	unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+	struct {
+		unsigned int max_delay:12;
+	} split;
+	unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+	struct {
+		unsigned int cos_max:16;
+	} split;
+	unsigned int full;
+};
+
+void rdt_ctrl_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+				   struct list_head **pos);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+				    unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+				    struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+		    struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+				unsigned long delay_ms);
+void mbm_handle_overflow(struct work_struct *work);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
+
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 406d7a6532f9..f6ea94f8954a 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -26,7 +26,7 @@
 #include <linux/kernfs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <asm/intel_rdt.h>
+#include "intel_rdt.h"
 
 /*
  * Check whether MBA bandwidth percentage value is correct. The value is
@@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
 {
 	struct rdt_resource *r;
 
-	for_each_enabled_rdt_resource(r) {
+	for_each_alloc_enabled_rdt_resource(r) {
 		if (!strcmp(resname, r->name) && closid < r->num_closid)
 			return parse_line(tok, r);
 	}
@@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
 	closid = rdtgrp->closid;
 
-	for_each_enabled_rdt_resource(r) {
+	for_each_alloc_enabled_rdt_resource(r) {
 		list_for_each_entry(dom, &r->domains, list)
 			dom->have_new_ctrl = false;
 	}
@@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 			goto out;
 	}
 
-	for_each_enabled_rdt_resource(r) {
+	for_each_alloc_enabled_rdt_resource(r) {
 		ret = update_domains(r, closid);
 		if (ret)
 			goto out;
@@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 {
 	struct rdtgroup *rdtgrp;
 	struct rdt_resource *r;
-	int closid, ret = 0;
+	int ret = 0;
+	u32 closid;
 
 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 	if (rdtgrp) {
 		closid = rdtgrp->closid;
-		for_each_enabled_rdt_resource(r) {
+		for_each_alloc_enabled_rdt_resource(r) {
 			if (closid < r->num_closid)
 				show_doms(s, r, closid);
 		}
@@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 	rdtgroup_kn_unlock(of->kn);
 	return ret;
 }
+
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+		    struct rdtgroup *rdtgrp, int evtid, int first)
+{
+	/*
+	 * setup the parameters to send to the IPI to read the data.
+	 */
+	rr->rgrp = rdtgrp;
+	rr->evtid = evtid;
+	rr->d = d;
+	rr->val = 0;
+	rr->first = first;
+
+	smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	u32 resid, evtid, domid;
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	union mon_data_bits md;
+	struct rdt_domain *d;
+	struct rmid_read rr;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	md.priv = of->kn->priv;
+	resid = md.u.rid;
+	domid = md.u.domid;
+	evtid = md.u.evtid;
+
+	r = &rdt_resources_all[resid];
+	d = rdt_find_domain(r, domid, NULL);
+	if (!d) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	mon_event_read(&rr, d, rdtgrp, evtid, false);
+
+	if (rr.val & RMID_VAL_ERROR)
+		seq_puts(m, "Error\n");
+	else if (rr.val & RMID_VAL_UNAVAIL)
+		seq_puts(m, "Unavailable\n");
+	else
+		seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 000000000000..30827510094b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,499 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ *    Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL		0x0c8d
+
+struct rmid_entry {
+	u32				rmid;
+	int				busy;
+	struct list_head		list;
+};
+
+/**
+ * @rmid_free_lru    A least recently used list of free RMIDs
+ *     These RMIDs are guaranteed to have an occupancy less than the
+ *     threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * @rmid_limbo_count     count of currently unused but (potentially)
+ *     dirty RMIDs.
+ *     This counts RMIDs that no one is currently using but that
+ *     may have a occupancy value > intel_cqm_threshold. User can change
+ *     the threshold occupancy value.
+ */
+unsigned int rmid_limbo_count;
+
+/**
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry	*rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+	struct rmid_entry *entry;
+
+	entry = &rmid_ptrs[rmid];
+	WARN_ON(entry->rmid != rmid);
+
+	return entry;
+}
+
+static u64 __rmid_read(u32 rmid, u32 eventid)
+{
+	u64 val;
+
+	/*
+	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+	 * with a valid event code for supported resource type and the bits
+	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+	 * are error bits.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	return val;
+}
+
+static bool rmid_dirty(struct rmid_entry *entry)
+{
+	u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+
+	return val >= intel_cqm_threshold;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
+ */
+void __check_limbo(struct rdt_domain *d, bool force_free)
+{
+	struct rmid_entry *entry;
+	struct rdt_resource *r;
+	u32 crmid = 1, nrmid;
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	/*
+	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+	 * are marked as busy for occupancy < threshold. If the occupancy
+	 * is less than the threshold decrement the busy counter of the
+	 * RMID and move it to the free list when the counter reaches 0.
+	 */
+	for (;;) {
+		nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
+		if (nrmid >= r->num_rmid)
+			break;
+
+		entry = __rmid_entry(nrmid);
+		if (force_free || !rmid_dirty(entry)) {
+			clear_bit(entry->rmid, d->rmid_busy_llc);
+			if (!--entry->busy) {
+				rmid_limbo_count--;
+				list_add_tail(&entry->list, &rmid_free_lru);
+			}
+		}
+		crmid = nrmid + 1;
+	}
+}
+
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+	return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+}
+
+/*
+ * As of now the RMIDs allocation is global.
+ * However we keep track of which packages the RMIDs
+ * are used to optimize the limbo list management.
+ */
+int alloc_rmid(void)
+{
+	struct rmid_entry *entry;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	if (list_empty(&rmid_free_lru))
+		return rmid_limbo_count ? -EBUSY : -ENOSPC;
+
+	entry = list_first_entry(&rmid_free_lru,
+				 struct rmid_entry, list);
+	list_del(&entry->list);
+
+	return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+	struct rdt_resource *r;
+	struct rdt_domain *d;
+	int cpu;
+	u64 val;
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	entry->busy = 0;
+	cpu = get_cpu();
+	list_for_each_entry(d, &r->domains, list) {
+		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+			val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+			if (val <= intel_cqm_threshold)
+				continue;
+		}
+
+		/*
+		 * For the first limbo RMID in the domain,
+		 * setup up the limbo worker.
+		 */
+		if (!has_busy_rmid(r, d))
+			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
+		set_bit(entry->rmid, d->rmid_busy_llc);
+		entry->busy++;
+	}
+	put_cpu();
+
+	if (entry->busy)
+		rmid_limbo_count++;
+	else
+		list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+void free_rmid(u32 rmid)
+{
+	struct rmid_entry *entry;
+
+	if (!rmid)
+		return;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	entry = __rmid_entry(rmid);
+
+	if (is_llc_occupancy_enabled())
+		add_rmid_to_limbo(entry);
+	else
+		list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+	u64 chunks, shift, tval;
+	struct mbm_state *m;
+
+	tval = __rmid_read(rmid, rr->evtid);
+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+		rr->val = tval;
+		return -EINVAL;
+	}
+	switch (rr->evtid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		rr->val += tval;
+		return 0;
+	case QOS_L3_MBM_TOTAL_EVENT_ID:
+		m = &rr->d->mbm_total[rmid];
+		break;
+	case QOS_L3_MBM_LOCAL_EVENT_ID:
+		m = &rr->d->mbm_local[rmid];
+		break;
+	default:
+		/*
+		 * Code would never reach here because
+		 * an invalid event id would fail the __rmid_read.
+		 */
+		return -EINVAL;
+	}
+
+	if (rr->first) {
+		m->prev_msr = tval;
+		m->chunks = 0;
+		return 0;
+	}
+
+	shift = 64 - MBM_CNTR_WIDTH;
+	chunks = (tval << shift) - (m->prev_msr << shift);
+	chunks >>= shift;
+	m->chunks += chunks;
+	m->prev_msr = tval;
+
+	rr->val += m->chunks;
+	return 0;
+}
+
+/*
+ * This is called via IPI to read the CQM/MBM counters
+ * on a domain.
+ */
+void mon_event_count(void *info)
+{
+	struct rdtgroup *rdtgrp, *entry;
+	struct rmid_read *rr = info;
+	struct list_head *head;
+
+	rdtgrp = rr->rgrp;
+
+	if (__mon_event_count(rdtgrp->mon.rmid, rr))
+		return;
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rdtgrp->mon.crdtgrp_list;
+
+	if (rdtgrp->type == RDTCTRL_GROUP) {
+		list_for_each_entry(entry, head, mon.crdtgrp_list) {
+			if (__mon_event_count(entry->mon.rmid, rr))
+				return;
+		}
+	}
+}
+
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+	struct rmid_read rr;
+
+	rr.first = false;
+	rr.d = d;
+
+	/*
+	 * This is protected from concurrent reads from user
+	 * as both the user and we hold the global mutex.
+	 */
+	if (is_mbm_total_enabled()) {
+		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
+	}
+	if (is_mbm_local_enabled()) {
+		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
+	}
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+	int cpu = smp_processor_id();
+	struct rdt_resource *r;
+	struct rdt_domain *d;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+	d = get_domain_from_cpu(cpu, r);
+
+	if (!d) {
+		pr_warn_once("Failure to get domain for limbo worker\n");
+		goto out_unlock;
+	}
+
+	__check_limbo(d, false);
+
+	if (has_busy_rmid(r, d))
+		schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	struct rdt_resource *r;
+	int cpu;
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	cpu = cpumask_any(&dom->cpu_mask);
+	dom->cqm_work_cpu = cpu;
+
+	schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+	struct rdtgroup *prgrp, *crgrp;
+	int cpu = smp_processor_id();
+	struct list_head *head;
+	struct rdt_domain *d;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	if (!static_branch_likely(&rdt_enable_key))
+		goto out_unlock;
+
+	d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
+	if (!d)
+		goto out_unlock;
+
+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+		mbm_update(d, prgrp->mon.rmid);
+
+		head = &prgrp->mon.crdtgrp_list;
+		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+			mbm_update(d, crgrp->mon.rmid);
+	}
+
+	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	int cpu;
+
+	if (!static_branch_likely(&rdt_enable_key))
+		return;
+	cpu = cpumask_any(&dom->cpu_mask);
+	dom->mbm_work_cpu = cpu;
+	schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+	struct rmid_entry *entry = NULL;
+	int i, nr_rmids;
+
+	nr_rmids = r->num_rmid;
+	rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
+	if (!rmid_ptrs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_rmids; i++) {
+		entry = &rmid_ptrs[i];
+		INIT_LIST_HEAD(&entry->list);
+
+		entry->rmid = i;
+		list_add_tail(&entry->list, &rmid_free_lru);
+	}
+
+	/*
+	 * RMID 0 is special and is always allocated. It's used for all
+	 * tasks that are not monitored.
+	 */
+	entry = __rmid_entry(0);
+	list_del(&entry->list);
+
+	return 0;
+}
+
+static struct mon_evt llc_occupancy_event = {
+	.name		= "llc_occupancy",
+	.evtid		= QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+	.name		= "mbm_total_bytes",
+	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+	.name		= "mbm_local_bytes",
+	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+	INIT_LIST_HEAD(&r->evt_list);
+
+	if (is_llc_occupancy_enabled())
+		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+	if (is_mbm_total_enabled())
+		list_add_tail(&mbm_total_event.list, &r->evt_list);
+	if (is_mbm_local_enabled())
+		list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+	int ret;
+
+	r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+	intel_cqm_threshold /= r->mon_scale;
+
+	ret = dom_data_init(r);
+	if (ret)
+		return ret;
+
+	l3_mon_evt_init(r);
+
+	r->mon_capable = true;
+	r->mon_enabled = true;
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9257bd9dc664..a869d4a073c5 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -32,17 +32,25 @@
 
 #include <uapi/linux/magic.h>
 
-#include <asm/intel_rdt.h>
-#include <asm/intel_rdt_common.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
 
 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-struct kernfs_root *rdt_root;
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+static struct kernfs_root *rdt_root;
 struct rdtgroup rdtgroup_default;
 LIST_HEAD(rdt_all_groups);
 
 /* Kernel fs node for "info" directory under root */
 static struct kernfs_node *kn_info;
 
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
 /*
  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  * we can keep a bitmap of free CLOSIDs in a single integer.
@@ -66,7 +74,7 @@ static void closid_init(void)
 	int rdt_min_closid = 32;
 
 	/* Compute rdt_min_closid across all resources */
-	for_each_enabled_rdt_resource(r)
+	for_each_alloc_enabled_rdt_resource(r)
 		rdt_min_closid = min(rdt_min_closid, r->num_closid);
 
 	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
@@ -75,9 +83,9 @@ static void closid_init(void)
 	closid_free_map &= ~1;
 }
 
-int closid_alloc(void)
+static int closid_alloc(void)
 {
-	int closid = ffs(closid_free_map);
+	u32 closid = ffs(closid_free_map);
 
 	if (closid == 0)
 		return -ENOSPC;
@@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 	return 0;
 }
 
-static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
-			      int len)
-{
-	struct rftype *rft;
-	int ret;
-
-	lockdep_assert_held(&rdtgroup_mutex);
-
-	for (rft = rfts; rft < rfts + len; rft++) {
-		ret = rdtgroup_add_file(kn, rft);
-		if (ret)
-			goto error;
-	}
-
-	return 0;
-error:
-	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
-	while (--rft >= rfts)
-		kernfs_remove_by_name(kn, rft->name);
-	return ret;
-}
-
 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 {
 	struct kernfs_open_file *of = m->private;
@@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
 	.seq_show		= rdtgroup_seqfile_show,
 };
 
+static struct kernfs_ops kf_mondata_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.seq_show		= rdtgroup_mondata_show,
+};
+
 static bool is_cpu_list(struct kernfs_open_file *of)
 {
 	struct rftype *rft = of->kn->priv;
@@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 /*
  * This is safe against intel_rdt_sched_in() called from __switch_to()
  * because __switch_to() is executed with interrupts disabled. A local call
- * from rdt_update_closid() is proteced against __switch_to() because
+ * from update_closid_rmid() is proteced against __switch_to() because
  * preemption is disabled.
  */
-static void rdt_update_cpu_closid(void *closid)
+static void update_cpu_closid_rmid(void *info)
 {
-	if (closid)
-		this_cpu_write(cpu_closid, *(int *)closid);
+	struct rdtgroup *r = info;
+
+	if (r) {
+		this_cpu_write(pqr_state.default_closid, r->closid);
+		this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+	}
+
 	/*
 	 * We cannot unconditionally write the MSR because the current
 	 * executing task might have its own closid selected. Just reuse
@@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid)
 /*
  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
  *
- * Per task closids must have been set up before calling this function.
- *
- * The per cpu closids are updated with the smp function call, when @closid
- * is not NULL. If @closid is NULL then all affected percpu closids must
- * have been set up before calling this function.
+ * Per task closids/rmids must have been set up before calling this function.
  */
 static void
-rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 {
 	int cpu = get_cpu();
 
 	if (cpumask_test_cpu(cpu, cpu_mask))
-		rdt_update_cpu_closid(closid);
-	smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+		update_cpu_closid_rmid(r);
+	smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 	put_cpu();
 }
 
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+			  cpumask_var_t tmpmask)
+{
+	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+	struct list_head *head;
+
+	/* Check whether cpus belong to parent ctrl group */
+	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+	if (cpumask_weight(tmpmask))
+		return -EINVAL;
+
+	/* Check whether cpus are dropped from this group */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		/* Give any dropped cpus to parent rdtgroup */
+		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+		update_closid_rmid(tmpmask, prgrp);
+	}
+
+	/*
+	 * If we added cpus, remove them from previous group that owned them
+	 * and update per-cpu rmid
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		head = &prgrp->mon.crdtgrp_list;
+		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+			if (crgrp == rdtgrp)
+				continue;
+			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+				       tmpmask);
+		}
+		update_closid_rmid(tmpmask, rdtgrp);
+	}
+
+	/* Done pushing/pulling - update this group with new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+	return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+	struct rdtgroup *crgrp;
+
+	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+	/* update the child mon group masks as well*/
+	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+	struct rdtgroup *r, *crgrp;
+	struct list_head *head;
+
+	/* Check whether cpus are dropped from this group */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		/* Can't drop from default group */
+		if (rdtgrp == &rdtgroup_default)
+			return -EINVAL;
+
+		/* Give any dropped cpus to rdtgroup_default */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, tmpmask);
+		update_closid_rmid(tmpmask, &rdtgroup_default);
+	}
+
+	/*
+	 * If we added cpus, remove them from previous group and
+	 * the prev group's child groups that owned them
+	 * and update per-cpu closid/rmid.
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+			if (r == rdtgrp)
+				continue;
+			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+			if (cpumask_weight(tmpmask1))
+				cpumask_rdtgrp_clear(r, tmpmask1);
+		}
+		update_closid_rmid(tmpmask, rdtgrp);
+	}
+
+	/* Done pushing/pulling - update this group with new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+	/*
+	 * Clear child mon group masks since there is a new parent mask
+	 * now and update the rmid for the cpus the child lost.
+	 */
+	head = &rdtgrp->mon.crdtgrp_list;
+	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+		update_closid_rmid(tmpmask, rdtgrp);
+		cpumask_clear(&crgrp->cpu_mask);
+	}
+
+	return 0;
+}
+
 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 				   char *buf, size_t nbytes, loff_t off)
 {
-	cpumask_var_t tmpmask, newmask;
-	struct rdtgroup *rdtgrp, *r;
+	cpumask_var_t tmpmask, newmask, tmpmask1;
+	struct rdtgroup *rdtgrp;
 	int ret;
 
 	if (!buf)
@@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 		free_cpumask_var(tmpmask);
 		return -ENOMEM;
 	}
+	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+		free_cpumask_var(tmpmask);
+		free_cpumask_var(newmask);
+		return -ENOMEM;
+	}
 
 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 	if (!rdtgrp) {
@@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 		goto unlock;
 	}
 
-	/* Check whether cpus are dropped from this group */
-	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
-	if (cpumask_weight(tmpmask)) {
-		/* Can't drop from default group */
-		if (rdtgrp == &rdtgroup_default) {
-			ret = -EINVAL;
-			goto unlock;
-		}
-		/* Give any dropped cpus to rdtgroup_default */
-		cpumask_or(&rdtgroup_default.cpu_mask,
-			   &rdtgroup_default.cpu_mask, tmpmask);
-		rdt_update_closid(tmpmask, &rdtgroup_default.closid);
-	}
-
-	/*
-	 * If we added cpus, remove them from previous group that owned them
-	 * and update per-cpu closid
-	 */
-	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
-	if (cpumask_weight(tmpmask)) {
-		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
-			if (r == rdtgrp)
-				continue;
-			cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
-		}
-		rdt_update_closid(tmpmask, &rdtgrp->closid);
-	}
-
-	/* Done pushing/pulling - update this group with new mask */
-	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+	if (rdtgrp->type == RDTCTRL_GROUP)
+		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+	else if (rdtgrp->type == RDTMON_GROUP)
+		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+	else
+		ret = -EINVAL;
 
 unlock:
 	rdtgroup_kn_unlock(of->kn);
 	free_cpumask_var(tmpmask);
 	free_cpumask_var(newmask);
+	free_cpumask_var(tmpmask1);
 
 	return ret ?: nbytes;
 }
@@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head)
 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
 	    (rdtgrp->flags & RDT_DELETED)) {
 		current->closid = 0;
+		current->rmid = 0;
 		kfree(rdtgrp);
 	}
 
@@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
 		atomic_dec(&rdtgrp->waitcount);
 		kfree(callback);
 	} else {
-		tsk->closid = rdtgrp->closid;
+		/*
+		 * For ctrl_mon groups move both closid and rmid.
+		 * For monitor groups, can move the tasks only from
+		 * their parent CTRL group.
+		 */
+		if (rdtgrp->type == RDTCTRL_GROUP) {
+			tsk->closid = rdtgrp->closid;
+			tsk->rmid = rdtgrp->mon.rmid;
+		} else if (rdtgrp->type == RDTMON_GROUP) {
+			if (rdtgrp->mon.parent->closid == tsk->closid)
+				tsk->rmid = rdtgrp->mon.rmid;
+			else
+				ret = -EINVAL;
+		}
 	}
 	return ret;
 }
@@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 
 	rcu_read_lock();
 	for_each_process_thread(p, t) {
-		if (t->closid == r->closid)
+		if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+		    (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
 			seq_printf(s, "%d\n", t->pid);
 	}
 	rcu_read_unlock();
@@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 	return ret;
 }
 
-/* Files in each rdtgroup */
-static struct rftype rdtgroup_base_files[] = {
-	{
-		.name		= "cpus",
-		.mode		= 0644,
-		.kf_ops		= &rdtgroup_kf_single_ops,
-		.write		= rdtgroup_cpus_write,
-		.seq_show	= rdtgroup_cpus_show,
-	},
-	{
-		.name		= "cpus_list",
-		.mode		= 0644,
-		.kf_ops		= &rdtgroup_kf_single_ops,
-		.write		= rdtgroup_cpus_write,
-		.seq_show	= rdtgroup_cpus_show,
-		.flags		= RFTYPE_FLAGS_CPUS_LIST,
-	},
-	{
-		.name		= "tasks",
-		.mode		= 0644,
-		.kf_ops		= &rdtgroup_kf_single_ops,
-		.write		= rdtgroup_tasks_write,
-		.seq_show	= rdtgroup_tasks_show,
-	},
-	{
-		.name		= "schemata",
-		.mode		= 0644,
-		.kf_ops		= &rdtgroup_kf_single_ops,
-		.write		= rdtgroup_schemata_write,
-		.seq_show	= rdtgroup_schemata_show,
-	},
-};
-
 static int rdt_num_closids_show(struct kernfs_open_file *of,
 				struct seq_file *seq, void *v)
 {
@@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 	return 0;
 }
 
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+				   struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%x\n", r->cache.shareable_bits);
+	return 0;
+}
+
 static int rdt_min_bw_show(struct kernfs_open_file *of,
 			     struct seq_file *seq, void *v)
 {
@@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
 	return 0;
 }
 
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+			      struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%d\n", r->num_rmid);
+
+	return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+				 struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	struct mon_evt *mevt;
+
+	list_for_each_entry(mevt, &r->evt_list, list)
+		seq_printf(seq, "%s\n", mevt->name);
+
+	return 0;
+}
+
 static int rdt_bw_gran_show(struct kernfs_open_file *of,
 			     struct seq_file *seq, void *v)
 {
@@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
 	return 0;
 }
 
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+				  struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+	return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	unsigned int bytes;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &bytes);
+	if (ret)
+		return ret;
+
+	if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+		return -EINVAL;
+
+	intel_cqm_threshold = bytes / r->mon_scale;
+
+	return nbytes;
+}
+
 /* rdtgroup information files for one cache resource. */
-static struct rftype res_cache_info_files[] = {
+static struct rftype res_common_files[] = {
 	{
 		.name		= "num_closids",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_num_closids_show,
+		.fflags		= RF_CTRL_INFO,
+	},
+	{
+		.name		= "mon_features",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_mon_features_show,
+		.fflags		= RF_MON_INFO,
+	},
+	{
+		.name		= "num_rmids",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_num_rmids_show,
+		.fflags		= RF_MON_INFO,
 	},
 	{
 		.name		= "cbm_mask",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_default_ctrl_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "min_cbm_bits",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_min_cbm_bits_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
-};
-
-/* rdtgroup information files for memory bandwidth. */
-static struct rftype res_mba_info_files[] = {
 	{
-		.name		= "num_closids",
+		.name		= "shareable_bits",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
-		.seq_show	= rdt_num_closids_show,
+		.seq_show	= rdt_shareable_bits_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "min_bandwidth",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_min_bw_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
 	},
 	{
 		.name		= "bandwidth_gran",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_bw_gran_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
 	},
 	{
 		.name		= "delay_linear",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_delay_linear_show,
+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
+	},
+	{
+		.name		= "max_threshold_occupancy",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= max_threshold_occ_write,
+		.seq_show	= max_threshold_occ_show,
+		.fflags		= RF_MON_INFO | RFTYPE_RES_CACHE,
+	},
+	{
+		.name		= "cpus",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_cpus_write,
+		.seq_show	= rdtgroup_cpus_show,
+		.fflags		= RFTYPE_BASE,
+	},
+	{
+		.name		= "cpus_list",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_cpus_write,
+		.seq_show	= rdtgroup_cpus_show,
+		.flags		= RFTYPE_FLAGS_CPUS_LIST,
+		.fflags		= RFTYPE_BASE,
+	},
+	{
+		.name		= "tasks",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_tasks_write,
+		.seq_show	= rdtgroup_tasks_show,
+		.fflags		= RFTYPE_BASE,
+	},
+	{
+		.name		= "schemata",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_schemata_write,
+		.seq_show	= rdtgroup_schemata_show,
+		.fflags		= RF_CTRL_BASE,
 	},
 };
 
-void rdt_get_mba_infofile(struct rdt_resource *r)
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
 {
-	r->info_files = res_mba_info_files;
-	r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+	struct rftype *rfts, *rft;
+	int ret, len;
+
+	rfts = res_common_files;
+	len = ARRAY_SIZE(res_common_files);
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	for (rft = rfts; rft < rfts + len; rft++) {
+		if ((fflags & rft->fflags) == rft->fflags) {
+			ret = rdtgroup_add_file(kn, rft);
+			if (ret)
+				goto error;
+		}
+	}
+
+	return 0;
+error:
+	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+	while (--rft >= rfts) {
+		if ((fflags & rft->fflags) == rft->fflags)
+			kernfs_remove_by_name(kn, rft->name);
+	}
+	return ret;
 }
 
-void rdt_get_cache_infofile(struct rdt_resource *r)
+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+				      unsigned long fflags)
 {
-	r->info_files = res_cache_info_files;
-	r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+	struct kernfs_node *kn_subdir;
+	int ret;
+
+	kn_subdir = kernfs_create_dir(kn_info, name,
+				      kn_info->mode, r);
+	if (IS_ERR(kn_subdir))
+		return PTR_ERR(kn_subdir);
+
+	kernfs_get(kn_subdir);
+	ret = rdtgroup_kn_set_ugid(kn_subdir);
+	if (ret)
+		return ret;
+
+	ret = rdtgroup_add_files(kn_subdir, fflags);
+	if (!ret)
+		kernfs_activate(kn_subdir);
+
+	return ret;
 }
 
 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 {
-	struct kernfs_node *kn_subdir;
-	struct rftype *res_info_files;
 	struct rdt_resource *r;
-	int ret, len;
+	unsigned long fflags;
+	char name[32];
+	int ret;
 
 	/* create the directory */
 	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 		return PTR_ERR(kn_info);
 	kernfs_get(kn_info);
 
-	for_each_enabled_rdt_resource(r) {
-		kn_subdir = kernfs_create_dir(kn_info, r->name,
-					      kn_info->mode, r);
-		if (IS_ERR(kn_subdir)) {
-			ret = PTR_ERR(kn_subdir);
-			goto out_destroy;
-		}
-		kernfs_get(kn_subdir);
-		ret = rdtgroup_kn_set_ugid(kn_subdir);
+	for_each_alloc_enabled_rdt_resource(r) {
+		fflags =  r->fflags | RF_CTRL_INFO;
+		ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
 		if (ret)
 			goto out_destroy;
+	}
 
-		res_info_files = r->info_files;
-		len = r->nr_info_files;
-
-		ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
+	for_each_mon_enabled_rdt_resource(r) {
+		fflags =  r->fflags | RF_MON_INFO;
+		sprintf(name, "%s_MON", r->name);
+		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
 		if (ret)
 			goto out_destroy;
-		kernfs_activate(kn_subdir);
 	}
 
 	/*
@@ -678,6 +889,39 @@ out_destroy:
 	return ret;
 }
 
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+		    char *name, struct kernfs_node **dest_kn)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	/* create the directory */
+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	if (dest_kn)
+		*dest_kn = kn;
+
+	/*
+	 * This extra ref will be put in kernfs_remove() and guarantees
+	 * that @rdtgrp->kn is always accessible.
+	 */
+	kernfs_get(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn);
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
 static void l3_qos_cfg_update(void *arg)
 {
 	bool *enable = arg;
@@ -718,14 +962,15 @@ static int cdp_enable(void)
 	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
 	int ret;
 
-	if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+	if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
+	    !r_l3code->alloc_capable)
 		return -EINVAL;
 
 	ret = set_l3_qos_cfg(r_l3, true);
 	if (!ret) {
-		r_l3->enabled = false;
-		r_l3data->enabled = true;
-		r_l3code->enabled = true;
+		r_l3->alloc_enabled = false;
+		r_l3data->alloc_enabled = true;
+		r_l3code->alloc_enabled = true;
 	}
 	return ret;
 }
@@ -734,11 +979,11 @@ static void cdp_disable(void)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
 
-	r->enabled = r->capable;
+	r->alloc_enabled = r->alloc_capable;
 
-	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
-		rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
-		rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
+		rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
+		rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
 		set_l3_qos_cfg(r, false);
 	}
 }
@@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
 	}
 }
 
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+			     struct rdtgroup *prgrp,
+			     struct kernfs_node **mon_data_kn);
+
 static struct dentry *rdt_mount(struct file_system_type *fs_type,
 				int flags, const char *unused_dev_name,
 				void *data)
 {
+	struct rdt_domain *dom;
+	struct rdt_resource *r;
 	struct dentry *dentry;
 	int ret;
 
@@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 		goto out_cdp;
 	}
 
+	if (rdt_mon_capable) {
+		ret = mongroup_create_dir(rdtgroup_default.kn,
+					  NULL, "mon_groups",
+					  &kn_mongrp);
+		if (ret) {
+			dentry = ERR_PTR(ret);
+			goto out_info;
+		}
+		kernfs_get(kn_mongrp);
+
+		ret = mkdir_mondata_all(rdtgroup_default.kn,
+					&rdtgroup_default, &kn_mondata);
+		if (ret) {
+			dentry = ERR_PTR(ret);
+			goto out_mongrp;
+		}
+		kernfs_get(kn_mondata);
+		rdtgroup_default.mon.mon_data_kn = kn_mondata;
+	}
+
 	dentry = kernfs_mount(fs_type, flags, rdt_root,
 			      RDTGROUP_SUPER_MAGIC, NULL);
 	if (IS_ERR(dentry))
-		goto out_destroy;
+		goto out_mondata;
+
+	if (rdt_alloc_capable)
+		static_branch_enable(&rdt_alloc_enable_key);
+	if (rdt_mon_capable)
+		static_branch_enable(&rdt_mon_enable_key);
+
+	if (rdt_alloc_capable || rdt_mon_capable)
+		static_branch_enable(&rdt_enable_key);
+
+	if (is_mbm_enabled()) {
+		r = &rdt_resources_all[RDT_RESOURCE_L3];
+		list_for_each_entry(dom, &r->domains, list)
+			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+	}
 
-	static_branch_enable(&rdt_enable_key);
 	goto out;
 
-out_destroy:
+out_mondata:
+	if (rdt_mon_capable)
+		kernfs_remove(kn_mondata);
+out_mongrp:
+	if (rdt_mon_capable)
+		kernfs_remove(kn_mongrp);
+out_info:
 	kernfs_remove(kn_info);
 out_cdp:
 	cdp_disable();
@@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
 	return 0;
 }
 
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	return (rdt_alloc_capable &&
+		(r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	return (rdt_mon_capable &&
+		(r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
 /*
  * Move tasks from one to the other group. If @from is NULL, then all tasks
  * in the systems are moved unconditionally (used for teardown).
@@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(p, t) {
-		if (!from || t->closid == from->closid) {
+		if (!from || is_closid_match(t, from) ||
+		    is_rmid_match(t, from)) {
 			t->closid = to->closid;
+			t->rmid = to->mon.rmid;
+
 #ifdef CONFIG_SMP
 			/*
 			 * This is safe on x86 w/o barriers as the ordering
@@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
 	read_unlock(&tasklist_lock);
 }
 
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+	struct rdtgroup *sentry, *stmp;
+	struct list_head *head;
+
+	head = &rdtgrp->mon.crdtgrp_list;
+	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+		free_rmid(sentry->mon.rmid);
+		list_del(&sentry->mon.crdtgrp_list);
+		kfree(sentry);
+	}
+}
+
 /*
  * Forcibly remove all of subdirectories under root.
  */
@@ -955,6 +1273,9 @@ static void rmdir_all_sub(void)
 	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
 
 	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+		/* Free any child rmids */
+		free_all_child_rdtgrp(rdtgrp);
+
 		/* Remove each rdtgroup other than root */
 		if (rdtgrp == &rdtgroup_default)
 			continue;
@@ -967,16 +1288,20 @@ static void rmdir_all_sub(void)
 		cpumask_or(&rdtgroup_default.cpu_mask,
 			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
 
+		free_rmid(rdtgrp->mon.rmid);
+
 		kernfs_remove(rdtgrp->kn);
 		list_del(&rdtgrp->rdtgroup_list);
 		kfree(rdtgrp);
 	}
 	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
 	get_online_cpus();
-	rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
 	put_online_cpus();
 
 	kernfs_remove(kn_info);
+	kernfs_remove(kn_mongrp);
+	kernfs_remove(kn_mondata);
 }
 
 static void rdt_kill_sb(struct super_block *sb)
@@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb)
 	mutex_lock(&rdtgroup_mutex);
 
 	/*Put everything back to default values. */
-	for_each_enabled_rdt_resource(r)
+	for_each_alloc_enabled_rdt_resource(r)
 		reset_all_ctrls(r);
 	cdp_disable();
 	rmdir_all_sub();
+	static_branch_disable(&rdt_alloc_enable_key);
+	static_branch_disable(&rdt_mon_enable_key);
 	static_branch_disable(&rdt_enable_key);
 	kernfs_kill_sb(sb);
 	mutex_unlock(&rdtgroup_mutex);
@@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = {
 	.kill_sb = rdt_kill_sb,
 };
 
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-			  umode_t mode)
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+		       void *priv)
 {
-	struct rdtgroup *parent, *rdtgrp;
 	struct kernfs_node *kn;
-	int ret, closid;
+	int ret = 0;
 
-	/* Only allow mkdir in the root directory */
-	if (parent_kn != rdtgroup_default.kn)
-		return -EPERM;
+	kn = __kernfs_create_file(parent_kn, name, 0444, 0,
+				  &kf_mondata_ops, priv, NULL, NULL);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
 
-	/* Do not accept '\n' to avoid unparsable situation. */
-	if (strchr(name, '\n'))
-		return -EINVAL;
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret) {
+		kernfs_remove(kn);
+		return ret;
+	}
 
-	parent = rdtgroup_kn_lock_live(parent_kn);
-	if (!parent) {
-		ret = -ENODEV;
-		goto out_unlock;
+	return ret;
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups with given domain id.
+ */
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+{
+	struct rdtgroup *prgrp, *crgrp;
+	char name[32];
+
+	if (!r->mon_enabled)
+		return;
+
+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+		sprintf(name, "mon_%s_%02d", r->name, dom_id);
+		kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+
+		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+			kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
 	}
+}
 
-	ret = closid_alloc();
-	if (ret < 0)
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+				struct rdt_domain *d,
+				struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+	union mon_data_bits priv;
+	struct kernfs_node *kn;
+	struct mon_evt *mevt;
+	struct rmid_read rr;
+	char name[32];
+	int ret;
+
+	sprintf(name, "mon_%s_%02d", r->name, d->id);
+	/* create the directory */
+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	/*
+	 * This extra ref will be put in kernfs_remove() and guarantees
+	 * that kn is always accessible.
+	 */
+	kernfs_get(kn);
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	if (WARN_ON(list_empty(&r->evt_list))) {
+		ret = -EPERM;
+		goto out_destroy;
+	}
+
+	priv.u.rid = r->rid;
+	priv.u.domid = d->id;
+	list_for_each_entry(mevt, &r->evt_list, list) {
+		priv.u.evtid = mevt->evtid;
+		ret = mon_addfile(kn, mevt->name, priv.priv);
+		if (ret)
+			goto out_destroy;
+
+		if (is_mbm_event(mevt->evtid))
+			mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+	}
+	kernfs_activate(kn);
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
+
+/*
+ * Add all subdirectories of mon_data for "ctrl_mon" groups
+ * and "monitor" groups with given domain id.
+ */
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+				    struct rdt_domain *d)
+{
+	struct kernfs_node *parent_kn;
+	struct rdtgroup *prgrp, *crgrp;
+	struct list_head *head;
+
+	if (!r->mon_enabled)
+		return;
+
+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+		parent_kn = prgrp->mon.mon_data_kn;
+		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+
+		head = &prgrp->mon.crdtgrp_list;
+		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+			parent_kn = crgrp->mon.mon_data_kn;
+			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
+		}
+	}
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+				       struct rdt_resource *r,
+				       struct rdtgroup *prgrp)
+{
+	struct rdt_domain *dom;
+	int ret;
+
+	list_for_each_entry(dom, &r->domains, list) {
+		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * This creates a directory mon_data which contains the monitored data.
+ *
+ * mon_data has one directory for each domain whic are named
+ * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+			     struct rdtgroup *prgrp,
+			     struct kernfs_node **dest_kn)
+{
+	struct rdt_resource *r;
+	struct kernfs_node *kn;
+	int ret;
+
+	/*
+	 * Create the mon_data directory first.
+	 */
+	ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+	if (ret)
+		return ret;
+
+	if (dest_kn)
+		*dest_kn = kn;
+
+	/*
+	 * Create the subdirectories for each domain. Note that all events
+	 * in a domain like L3 are grouped into a resource whose domain is L3
+	 */
+	for_each_mon_enabled_rdt_resource(r) {
+		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
+		if (ret)
+			goto out_destroy;
+	}
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
+
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+			     struct kernfs_node *prgrp_kn,
+			     const char *name, umode_t mode,
+			     enum rdt_group_type rtype, struct rdtgroup **r)
+{
+	struct rdtgroup *prdtgrp, *rdtgrp;
+	struct kernfs_node *kn;
+	uint files = 0;
+	int ret;
+
+	prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+	if (!prdtgrp) {
+		ret = -ENODEV;
 		goto out_unlock;
-	closid = ret;
+	}
 
 	/* allocate the rdtgroup. */
 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
 	if (!rdtgrp) {
 		ret = -ENOSPC;
-		goto out_closid_free;
+		goto out_unlock;
 	}
-	rdtgrp->closid = closid;
-	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+	*r = rdtgrp;
+	rdtgrp->mon.parent = prdtgrp;
+	rdtgrp->type = rtype;
+	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
 
 	/* kernfs creates the directory for rdtgrp */
-	kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
 	if (IS_ERR(kn)) {
 		ret = PTR_ERR(kn);
-		goto out_cancel_ref;
+		goto out_free_rgrp;
 	}
 	rdtgrp->kn = kn;
 
@@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (ret)
 		goto out_destroy;
 
-	ret = rdtgroup_add_files(kn, rdtgroup_base_files,
-				 ARRAY_SIZE(rdtgroup_base_files));
+	files = RFTYPE_BASE | RFTYPE_CTRL;
+	files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+	ret = rdtgroup_add_files(kn, files);
 	if (ret)
 		goto out_destroy;
 
+	if (rdt_mon_capable) {
+		ret = alloc_rmid();
+		if (ret < 0)
+			goto out_destroy;
+		rdtgrp->mon.rmid = ret;
+
+		ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+		if (ret)
+			goto out_idfree;
+	}
 	kernfs_activate(kn);
 
-	ret = 0;
-	goto out_unlock;
+	/*
+	 * The caller unlocks the prgrp_kn upon success.
+	 */
+	return 0;
 
+out_idfree:
+	free_rmid(rdtgrp->mon.rmid);
 out_destroy:
 	kernfs_remove(rdtgrp->kn);
-out_cancel_ref:
-	list_del(&rdtgrp->rdtgroup_list);
+out_free_rgrp:
 	kfree(rdtgrp);
-out_closid_free:
-	closid_free(closid);
 out_unlock:
-	rdtgroup_kn_unlock(parent_kn);
+	rdtgroup_kn_unlock(prgrp_kn);
 	return ret;
 }
 
-static int rdtgroup_rmdir(struct kernfs_node *kn)
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+	kernfs_remove(rgrp->kn);
+	free_rmid(rgrp->mon.rmid);
+	kfree(rgrp);
+}
+
+/*
+ * Create a monitor group under "mon_groups" directory of a control
+ * and monitor group(ctrl_mon). This is a resource group
+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+			      struct kernfs_node *prgrp_kn,
+			      const char *name,
+			      umode_t mode)
+{
+	struct rdtgroup *rdtgrp, *prgrp;
+	int ret;
+
+	ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
+				&rdtgrp);
+	if (ret)
+		return ret;
+
+	prgrp = rdtgrp->mon.parent;
+	rdtgrp->closid = prgrp->closid;
+
+	/*
+	 * Add the rdtgrp to the list of rdtgrps the parent
+	 * ctrl_mon group has to track.
+	 */
+	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+
+	rdtgroup_kn_unlock(prgrp_kn);
+	return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+				   struct kernfs_node *prgrp_kn,
+				   const char *name, umode_t mode)
 {
-	int ret, cpu, closid = rdtgroup_default.closid;
 	struct rdtgroup *rdtgrp;
-	cpumask_var_t tmpmask;
+	struct kernfs_node *kn;
+	u32 closid;
+	int ret;
 
-	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
-		return -ENOMEM;
+	ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
+				&rdtgrp);
+	if (ret)
+		return ret;
 
-	rdtgrp = rdtgroup_kn_lock_live(kn);
-	if (!rdtgrp) {
-		ret = -EPERM;
-		goto out;
+	kn = rdtgrp->kn;
+	ret = closid_alloc();
+	if (ret < 0)
+		goto out_common_fail;
+	closid = ret;
+
+	rdtgrp->closid = closid;
+	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+	if (rdt_mon_capable) {
+		/*
+		 * Create an empty mon_groups directory to hold the subset
+		 * of tasks and cpus to monitor.
+		 */
+		ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+		if (ret)
+			goto out_id_free;
 	}
 
+	goto out_unlock;
+
+out_id_free:
+	closid_free(closid);
+	list_del(&rdtgrp->rdtgroup_list);
+out_common_fail:
+	mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+	rdtgroup_kn_unlock(prgrp_kn);
+	return ret;
+}
+
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ *   This makes sure "mon_groups" directory always has a ctrl_mon group
+ *   as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+	return (!strcmp(kn->name, "mon_groups") &&
+		strcmp(name, "mon_groups"));
+}
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			  umode_t mode)
+{
+	/* Do not accept '\n' to avoid unparsable situation. */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
+	/*
+	 * If the parent directory is the root directory and RDT
+	 * allocation is supported, add a control and monitoring
+	 * subdirectory
+	 */
+	if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+		return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+
+	/*
+	 * If RDT monitoring is supported and the parent directory is a valid
+	 * "mon_groups" directory, add a monitoring subdirectory.
+	 */
+	if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+		return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+
+	return -EPERM;
+}
+
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+			      cpumask_var_t tmpmask)
+{
+	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+	int cpu;
+
+	/* Give any tasks back to the parent group */
+	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+	/* Update per cpu rmid of the moved CPUs first */
+	for_each_cpu(cpu, &rdtgrp->cpu_mask)
+		per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+	/*
+	 * Update the MSR on moved CPUs and CPUs which have moved
+	 * task running on them.
+	 */
+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+	update_closid_rmid(tmpmask, NULL);
+
+	rdtgrp->flags = RDT_DELETED;
+	free_rmid(rdtgrp->mon.rmid);
+
+	/*
+	 * Remove the rdtgrp from the parent ctrl_mon group's list
+	 */
+	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+	list_del(&rdtgrp->mon.crdtgrp_list);
+
+	/*
+	 * one extra hold on this, will drop when we kfree(rdtgrp)
+	 * in rdtgroup_kn_unlock()
+	 */
+	kernfs_get(kn);
+	kernfs_remove(rdtgrp->kn);
+
+	return 0;
+}
+
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+			       cpumask_var_t tmpmask)
+{
+	int cpu;
+
 	/* Give any tasks back to the default group */
 	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
 
@@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 	cpumask_or(&rdtgroup_default.cpu_mask,
 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
 
-	/* Update per cpu closid of the moved CPUs first */
-	for_each_cpu(cpu, &rdtgrp->cpu_mask)
-		per_cpu(cpu_closid, cpu) = closid;
+	/* Update per cpu closid and rmid of the moved CPUs first */
+	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+		per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
+		per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
+	}
+
 	/*
 	 * Update the MSR on moved CPUs and CPUs which have moved
 	 * task running on them.
 	 */
 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
-	rdt_update_closid(tmpmask, NULL);
+	update_closid_rmid(tmpmask, NULL);
 
 	rdtgrp->flags = RDT_DELETED;
 	closid_free(rdtgrp->closid);
+	free_rmid(rdtgrp->mon.rmid);
+
+	/*
+	 * Free all the child monitor group rmids.
+	 */
+	free_all_child_rdtgrp(rdtgrp);
+
 	list_del(&rdtgrp->rdtgroup_list);
 
 	/*
@@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 	 */
 	kernfs_get(kn);
 	kernfs_remove(rdtgrp->kn);
-	ret = 0;
+
+	return 0;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent_kn = kn->parent;
+	struct rdtgroup *rdtgrp;
+	cpumask_var_t tmpmask;
+	int ret = 0;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rdtgrp = rdtgroup_kn_lock_live(kn);
+	if (!rdtgrp) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	/*
+	 * If the rdtgroup is a ctrl_mon group and parent directory
+	 * is the root directory, remove the ctrl_mon group.
+	 *
+	 * If the rdtgroup is a mon group and parent directory
+	 * is a valid "mon_groups" directory, remove the mon group.
+	 */
+	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
+		ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+	else if (rdtgrp->type == RDTMON_GROUP &&
+		 is_mon_groups(parent_kn, kn->name))
+		ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+	else
+		ret = -EPERM;
+
 out:
 	rdtgroup_kn_unlock(kn);
 	free_cpumask_var(tmpmask);
@@ -1129,7 +1845,7 @@ out:
 
 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
 {
-	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
 		seq_puts(seq, ",cdp");
 	return 0;
 }
@@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void)
 	mutex_lock(&rdtgroup_mutex);
 
 	rdtgroup_default.closid = 0;
+	rdtgroup_default.mon.rmid = 0;
+	rdtgroup_default.type = RDTCTRL_GROUP;
+	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
 
-	ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
-				 ARRAY_SIZE(rdtgroup_base_files));
+	ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
 	if (ret) {
 		kernfs_destroy_root(rdt_root);
 		goto out;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..3b413065c613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,6 +51,7 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/reboot.h>
+#include <asm/set_memory.h>
 
 #include "mce-internal.h"
 
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
 	return ret;
 }
 
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+	unsigned long decoy_addr;
+
+	/*
+	 * Unmap this page from the kernel 1:1 mappings to make sure
+	 * we don't log more errors because of speculative access to
+	 * the page.
+	 * We would like to just call:
+	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+	 * but doing that would radically increase the odds of a
+	 * speculative access to the posion page because we'd have
+	 * the virtual address of the kernel 1:1 mapping sitting
+	 * around in registers.
+	 * Instead we get tricky.  We create a non-canonical address
+	 * that looks just like the one we want, but has bit 63 flipped.
+	 * This relies on set_memory_np() not checking whether we passed
+	 * a legal address.
+	 */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+	if (set_memory_np(decoy_addr, 1))
+		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9e314bcf67cc..40e28ed77fbf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
 		wrmsr(smca_config, low, high);
 	}
 
-	/* Collect bank_info using CPU 0 for now. */
-	if (cpu)
+	/* Return early if this bank was already initialized. */
+	if (smca_banks[bank].hwid)
 		return;
 
 	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
@@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
 	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
 		s_hwid = &smca_hwid_mcatypes[i];
 		if (hwid_mcatype == s_hwid->hwid_mcatype) {
-
-			WARN(smca_banks[bank].hwid,
-			     "Bank %s already initialized!\n",
-			     smca_get_name(s_hwid->bank_type));
-
 			smca_banks[bank].hwid = s_hwid;
 			smca_banks[bank].id = low;
 			smca_banks[bank].sysfs_id = s_hwid->count++;
@@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
 	mce_log(&m);
 }
 
-static inline void __smp_deferred_error_interrupt(void)
-{
-	inc_irq_stat(irq_deferred_error_count);
-	deferred_error_int_vector();
-}
-
 asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
 {
 	entering_irq();
-	__smp_deferred_error_interrupt();
-	exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
-{
-	entering_irq();
 	trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
-	__smp_deferred_error_interrupt();
+	inc_irq_stat(irq_deferred_error_count);
+	deferred_error_int_vector();
 	trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
 	exiting_ack_irq();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d7cc190ae457..2da67b70ba98 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = {
 	NULL
 };
 
-static struct attribute_group thermal_attr_group = {
+static const struct attribute_group thermal_attr_group = {
 	.attrs	= thermal_throttle_attrs,
 	.name	= "thermal_throttle"
 };
@@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void)
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
 
-static inline void __smp_thermal_interrupt(void)
-{
-	inc_irq_stat(irq_thermal_count);
-	smp_thermal_vector();
-}
-
-asmlinkage __visible void __irq_entry
-smp_thermal_interrupt(struct pt_regs *regs)
-{
-	entering_irq();
-	__smp_thermal_interrupt();
-	exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry
-smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
 {
 	entering_irq();
 	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
-	__smp_thermal_interrupt();
+	inc_irq_stat(irq_thermal_count);
+	smp_thermal_vector();
 	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
 	exiting_ack_irq();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index bb0e75eed10a..5e7249e42f8f 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,24 +17,12 @@ static void default_threshold_interrupt(void)
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
-static inline void __smp_threshold_interrupt(void)
-{
-	inc_irq_stat(irq_threshold_count);
-	mce_threshold_vector();
-}
-
 asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
 {
 	entering_irq();
-	__smp_threshold_interrupt();
-	exiting_ack_irq();
-}
-
-asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
-{
-	entering_irq();
 	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
-	__smp_threshold_interrupt();
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
 	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
 	exiting_ack_irq();
 }
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 21b185793c80..c6daec4bdba5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch)
 
 	list_for_each_entry(p, &microcode_cache, plist) {
 		if (p->equiv_cpu == new_patch->equiv_cpu) {
-			if (p->patch_id >= new_patch->patch_id)
+			if (p->patch_id >= new_patch->patch_id) {
 				/* we already have the latest patch */
+				kfree(new_patch->data);
+				kfree(new_patch);
 				return;
+			}
 
 			list_replace(&p->plist, &new_patch->plist);
 			kfree(p->data);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9cb98ee103db..86e8f0b2537b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = {
 	NULL
 };
 
-static struct attribute_group mc_attr_group = {
+static const struct attribute_group mc_attr_group = {
 	.attrs			= mc_default_attrs,
 	.name			= "microcode",
 };
@@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = {
 	NULL
 };
 
-static struct attribute_group cpu_root_microcode_group = {
+static const struct attribute_group cpu_root_microcode_group = {
 	.name  = "microcode",
 	.attrs = cpu_root_microcode_attrs,
 };
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 59edbe9d4ccb..8f7a9bbad514 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header,
 	return false;
 }
 
-static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
 {
 	struct ucode_patch *p;
 
 	p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
 	if (!p)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	p->data = kmemdup(data, size, GFP_KERNEL);
 	if (!p->data) {
 		kfree(p);
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	}
 
 	return p;
@@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size)
 			if (mc_hdr->rev <= mc_saved_hdr->rev)
 				continue;
 
-			p = __alloc_microcode_buf(data, size);
-			if (IS_ERR(p))
+			p = memdup_patch(data, size);
+			if (!p)
 				pr_err("Error allocating buffer %p\n", data);
 			else
 				list_replace(&iter->plist, &p->plist);
@@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size)
 	 * newly found.
 	 */
 	if (!prev_found) {
-		p = __alloc_microcode_buf(data, size);
-		if (IS_ERR(p))
+		p = memdup_patch(data, size);
+		if (!p)
 			pr_err("Error allocating buffer for %p\n", data);
 		else
 			list_add_tail(&p->plist, &microcode_cache);
 	}
 
+	if (!p)
+		return;
+
 	/*
 	 * Save for early loading. On 32-bit, that needs to be a physical
 	 * address as the APs are running from physical addresses, before
 	 * paging has been enabled.
 	 */
-	if (p) {
-		if (IS_ENABLED(CONFIG_X86_32))
-			intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
-		else
-			intel_ucode_patch = p->data;
-	}
+	if (IS_ENABLED(CONFIG_X86_32))
+		intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+	else
+		intel_ucode_patch = p->data;
 }
 
 static int microcode_sanity_check(void *mc, int print_err)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 70e717fccdd6..9fc32651c911 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs)
 void hv_setup_vmbus_irq(void (*handler)(void))
 {
 	vmbus_handler = handler;
-	/*
-	 * Setup the IDT for hypervisor callback. Prevent reallocation
-	 * at module reload.
-	 */
-	if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
-		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
-				hyperv_callback_vector);
+	/* Setup the IDT for hypervisor callback */
+	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
 }
 
 void hv_remove_vmbus_irq(void)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c5bb63be4ba1..40d5a8a75212 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 	stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
 }
 
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+				unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
+
+	stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
 static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
 				      unsigned long size, mtrr_type type)
 {
@@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	/* Search for an empty MTRR */
 	i = mtrr_if->get_free_region(base, size, replace);
 	if (i >= 0) {
-		set_mtrr(i, base, size, type);
+		set_mtrr_cpuslocked(i, base, size, type);
 		if (likely(replace < 0)) {
 			mtrr_usage_table[i] = 1;
 		} else {
@@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 			if (increment)
 				mtrr_usage_table[i]++;
 			if (unlikely(replace != i)) {
-				set_mtrr(replace, 0, 0, 0);
+				set_mtrr_cpuslocked(replace, 0, 0, 0);
 				mtrr_usage_table[replace] = 0;
 			}
 		}
@@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 		goto out;
 	}
 	if (--mtrr_usage_table[reg] < 1)
-		set_mtrr(reg, 0, 0, 0);
+		set_mtrr_cpuslocked(reg, 0, 0, 0);
 	error = reg;
  out:
 	mutex_unlock(&mtrr_mutex);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 23c23508c012..05459ad3db46 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+	{ X86_FEATURE_SME,		CPUID_EAX,  0, 0x8000001f, 0 },
 	{ 0, 0, 0, 0, 0 }
 };
 
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3fe45f84ced4..cbf1f6ba39a8 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -235,8 +235,7 @@ static void __init dtb_add_ioapic(struct device_node *dn)
 
 	ret = of_address_to_resource(dn, 0, &r);
 	if (ret) {
-		printk(KERN_ERR "Can't obtain address from node %s.\n",
-				dn->full_name);
+		printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn);
 		return;
 	}
 	mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..f13b4c00a5de 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack_name)
 			printk("%s <%s>\n", log_lvl, stack_name);
 
+		if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+			__show_regs(regs, 0);
+
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
 		 * same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			 * Don't print regs->ip again if it was already printed
 			 * by __show_regs() below.
 			 */
-			if (regs && stack == &regs->ip) {
-				unwind_next_frame(&state);
-				continue;
-			}
+			if (regs && stack == &regs->ip)
+				goto next;
 
 			if (stack == ret_addr_p)
 				reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			if (!reliable)
 				continue;
 
+next:
 			/*
 			 * Get the next frame from the unwinder.  No need to
 			 * check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 			/* if the frame has entry regs, print them */
 			regs = unwind_get_entry_regs(&state);
-			if (regs)
+			if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
 				__show_regs(regs, 0);
 		}
 
@@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
 #ifdef CONFIG_X86_32
 	if (user_mode(regs)) {
 		sp = regs->sp;
-		ss = regs->ss & 0xffff;
+		ss = regs->ss;
 	} else {
 		sp = kernel_stack_pointer(regs);
 		savesegment(ss, ss);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
 		begin = end - (exception_stack_sizes[k] / sizeof(long));
 		regs  = (struct pt_regs *)end - 1;
 
-		if (stack < begin || stack >= end)
+		if (stack <= begin || stack >= end)
 			continue;
 
 		info->type	= STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 532da61d605c..71c11ad5643e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
  * Note: this function only works correctly once the E820 table is sorted and
  * not-overlapping (at least for the range specified), which is the case normally.
  */
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+					     enum e820_type type)
 {
 	int i;
 
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 		 * coverage of the desired range exists:
 		 */
 		if (start >= end)
-			return 1;
+			return entry;
 	}
-	return 0;
+
+	return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+	return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+	struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+	return entry ? entry->type : -EINVAL;
 }
 
 /*
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..a4516ca4c4f3 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
 	INTEL_BXT_IDS(&gen9_early_ops),
 	INTEL_KBL_IDS(&gen9_early_ops),
 	INTEL_GLK_IDS(&gen9_early_ops),
+	INTEL_CNL_IDS(&gen9_early_ops),
 };
 
 static void __init
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..f260e452e4f8
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,19 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+static __init int eisa_bus_probe(void)
+{
+	void __iomem *p = ioremap(0x0FFFD9, 4);
+
+	if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+		EISA_bus = 1;
+	iounmap(p);
+	return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6b91e2eb8d3f..9c4e7ba6870c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
 
 	pte_p = pte_offset_kernel(&pmd, addr);
 	stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
-	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
 
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 538ec012b371..cf2ce063f65a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/memblock.h>
 
+#include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/e820/api.h>
@@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void)
 asmlinkage __visible void __init i386_start_kernel(void)
 {
 	cr4_init_shadow();
+
+	idt_setup_early_handler();
+
 	sanitize_boot_params(&boot_params);
 
 	x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..bab4fa579450 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -14,6 +14,7 @@
 #include <linux/start_kernel.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -33,7 +34,6 @@
 /*
  * Manage page tables very early on.
  */
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,14 +45,17 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 	return ptr - (void *)_text + (void *)physaddr;
 }
 
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+				  struct boot_params *bp)
 {
 	unsigned long load_delta, *p;
+	unsigned long pgtable_flags;
 	pgdval_t *pgd;
 	p4dval_t *p4d;
 	pudval_t *pud;
 	pmdval_t *pmd, pmd_entry;
 	int i;
+	unsigned int *next_pgt_ptr;
 
 	/* Is the address too large? */
 	if (physaddr >> MAX_PHYSMEM_BITS)
@@ -68,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
 	if (load_delta & ~PMD_PAGE_MASK)
 		for (;;);
 
+	/* Activate Secure Memory Encryption (SME) if supported and enabled */
+	sme_enable(bp);
+
+	/* Include the SME encryption mask in the fixup value */
+	load_delta += sme_get_me_mask();
+
 	/* Fixup the physical addresses in the page table */
 
 	pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,30 +101,34 @@ void __head __startup_64(unsigned long physaddr)
 	 * it avoids problems around wraparound.
 	 */
 
-	pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
-	pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+	next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+	pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+	pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+
+	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
 
 	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
 		p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 
 		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-		pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
-		pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
 
 		i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
-		p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-		p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+		p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+		p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
 	} else {
 		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-		pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-		pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
 	}
 
 	i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
-	pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
-	pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+	pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+	pud[i + 1] = (pudval_t)pmd + pgtable_flags;
 
 	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+	pmd_entry += sme_get_me_mask();
 	pmd_entry +=  physaddr;
 
 	for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -136,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
 			pmd[i] += load_delta;
 	}
 
-	/* Fixup phys_base */
+	/*
+	 * Fixup phys_base - remove the memory encryption mask to obtain
+	 * the true physical address.
+	 */
 	p = fixup_pointer(&phys_base, physaddr);
-	*p += load_delta;
+	*p += load_delta - sme_get_me_mask();
+
+	/* Encrypt the kernel (if SME is active) */
+	sme_encrypt_kernel();
+
+	/*
+	 * Return the SME encryption mask (if SME is active) to be used as a
+	 * modifier for the initial pgdir entry programmed into CR3.
+	 */
+	return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+	/*
+	 * Return the SME encryption mask (if SME is active) to be used as a
+	 * modifier for the initial pgdir entry programmed into CR3.
+	 */
+	return sme_get_me_mask();
 }
 
 /* Wipe all early page tables except for the kernel symbol map */
@@ -146,17 +180,17 @@ static void __init reset_early_page_tables(void)
 {
 	memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
 	next_early_pgt = 0;
-	write_cr3(__pa_nodebug(early_top_pgt));
+	write_cr3(__sme_pa_nodebug(early_top_pgt));
 }
 
 /* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
 {
 	unsigned long physaddr = address - __PAGE_OFFSET;
 	pgdval_t pgd, *pgd_p;
 	p4dval_t p4d, *p4d_p;
 	pudval_t pud, *pud_p;
-	pmdval_t pmd, *pmd_p;
+	pmdval_t *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
 	if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -215,12 +249,21 @@ again:
 		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
 		*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
 	}
-	pmd = (physaddr & PMD_MASK) + early_pmd_flags;
 	pmd_p[pmd_index(address)] = pmd;
 
 	return 0;
 }
 
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	pmdval_t pmd;
+
+	pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+	return __early_make_pgtable(address, pmd);
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -243,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
 	char * command_line;
 	unsigned long cmd_line_ptr;
 
+	/*
+	 * If SME is active, this will create decrypted mappings of the
+	 * boot data in advance of the copy operations.
+	 */
+	sme_map_bootdata(real_mode_data);
+
 	memcpy(&boot_params, real_mode_data, sizeof boot_params);
 	sanitize_boot_params(&boot_params);
 	cmd_line_ptr = get_cmd_line_ptr();
@@ -250,12 +299,18 @@ static void __init copy_bootdata(char *real_mode_data)
 		command_line = __va(cmd_line_ptr);
 		memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 	}
+
+	/*
+	 * The old boot data is no longer needed and won't be reserved,
+	 * freeing up that memory for use by the system. If SME is active,
+	 * we need to remove the mappings that were created so that the
+	 * memory doesn't remain mapped as decrypted.
+	 */
+	sme_unmap_bootdata(real_mode_data);
 }
 
 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 {
-	int i;
-
 	/*
 	 * Build-time sanity checks on the kernel image and module
 	 * area mappings. (these are purely build-time and produce no code)
@@ -279,11 +334,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
 	clear_page(init_top_pgt);
 
+	/*
+	 * SME support may update early_pmd_flags to include the memory
+	 * encryption mask, so it needs to be called before anything
+	 * that may generate a page fault.
+	 */
+	sme_early_init();
+
 	kasan_early_init();
 
-	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-		set_intr_gate(i, early_idt_handler_array[i]);
-	load_idt((const struct desc_ptr *)&idt_descr);
+	idt_setup_early_handler();
 
 	copy_bootdata(__va(real_mode_data));
 
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..9ed3074d0d27 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
 	jmp *%eax
 
 .Lbad_subarch:
-WEAK(lguest_entry)
 WEAK(xen_entry)
 	/* Unknown implementation; there's really
 	   nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
 
 subarch_entries:
 	.long .Ldefault_entry		/* normal x86/PC */
-	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
 	.long .Ldefault_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
@@ -347,7 +345,6 @@ ENTRY(startup_32_smp)
 	movl %eax,%cr0
 
 	lgdt early_gdt_descr
-	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
 	movl %eax,%ss			# after changing gdt.
@@ -380,37 +377,6 @@ ENDPROC(startup_32_smp)
  */
 __INIT
 setup_once:
-	/*
-	 * Set up a idt with 256 interrupt gates that push zero if there
-	 * is no error code and then jump to early_idt_handler_common.
-	 * It doesn't actually load the idt - that needs to be done on
-	 * each CPU. Interrupts are enabled elsewhere, when we can be
-	 * relatively sure everything is ok.
-	 */
-
-	movl $idt_table,%edi
-	movl $early_idt_handler_array,%eax
-	movl $NUM_EXCEPTION_VECTORS,%ecx
-1:
-	movl %eax,(%edi)
-	movl %eax,4(%edi)
-	/* interrupt gate, dpl=0, present */
-	movl $(0x8E000000 + __KERNEL_CS),2(%edi)
-	addl $EARLY_IDT_HANDLER_SIZE,%eax
-	addl $8,%edi
-	loop 1b
-
-	movl $256 - NUM_EXCEPTION_VECTORS,%ecx
-	movl $ignore_int,%edx
-	movl $(__KERNEL_CS << 16),%eax
-	movw %dx,%ax		/* selector = 0x0010 = cs */
-	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
-2:
-	movl %eax,(%edi)
-	movl %edx,4(%edi)
-	addl $8,%edi
-	loop 2b
-
 #ifdef CONFIG_CC_STACKPROTECTOR
 	/*
 	 * Configure the stack canary. The linker can't handle this by
@@ -457,12 +423,9 @@ early_idt_handler_common:
 	/* The vector number is in pt_regs->gs */
 
 	cld
-	pushl	%fs		/* pt_regs->fs */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
-	pushl	%es		/* pt_regs->es */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
-	pushl	%ds		/* pt_regs->ds */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
+	pushl	%fs		/* pt_regs->fs (__fsh varies by model) */
+	pushl	%es		/* pt_regs->es (__esh varies by model) */
+	pushl	%ds		/* pt_regs->ds (__dsh varies by model) */
 	pushl	%eax		/* pt_regs->ax */
 	pushl	%ebp		/* pt_regs->bp */
 	pushl	%edi		/* pt_regs->di */
@@ -479,9 +442,8 @@ early_idt_handler_common:
 	/* Load the vector number into EDX */
 	movl	PT_GS(%esp), %edx
 
-	/* Load GS into pt_regs->gs and clear high bits */
+	/* Load GS into pt_regs->gs (and maybe clobber __gsh) */
 	movw	%gs, PT_GS(%esp)
-	movw	$0, PT_GS+2(%esp)
 
 	movl	%esp, %eax	/* args are pt_regs (EAX), trapnr (EDX) */
 	call	early_fixup_exception
@@ -493,18 +455,17 @@ early_idt_handler_common:
 	popl	%edi		/* pt_regs->di */
 	popl	%ebp		/* pt_regs->bp */
 	popl	%eax		/* pt_regs->ax */
-	popl	%ds		/* pt_regs->ds */
-	popl	%es		/* pt_regs->es */
-	popl	%fs		/* pt_regs->fs */
-	popl	%gs		/* pt_regs->gs */
+	popl	%ds		/* pt_regs->ds (always ignores __dsh) */
+	popl	%es		/* pt_regs->es (always ignores __esh) */
+	popl	%fs		/* pt_regs->fs (always ignores __fsh) */
+	popl	%gs		/* pt_regs->gs (always ignores __gsh) */
 	decl	%ss:early_recursion_flag
 	addl	$4, %esp	/* pop pt_regs->orig_ax */
 	iret
 ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
-	ALIGN
-ignore_int:
+ENTRY(early_ignore_irq)
 	cld
 #ifdef CONFIG_PRINTK
 	pushl %eax
@@ -539,7 +500,8 @@ ignore_int:
 hlt_loop:
 	hlt
 	jmp hlt_loop
-ENDPROC(ignore_int)
+ENDPROC(early_ignore_irq)
+
 __INITDATA
 	.align 4
 GLOBAL(early_recursion_flag)
@@ -628,7 +590,6 @@ int_msg:
 
 	.data
 .globl boot_gdt_descr
-.globl idt_descr
 
 	ALIGN
 # early boot GDT descriptor (must use 1:1 address mapping)
@@ -637,11 +598,6 @@ boot_gdt_descr:
 	.word __BOOT_DS+7
 	.long boot_gdt - __PAGE_OFFSET
 
-	.word 0				# 32-bit align idt_desc.address
-idt_descr:
-	.word IDT_ENTRIES*8-1		# idt contains 256 entries
-	.long idt_table
-
 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..513cbb012ecc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,12 +73,19 @@ startup_64:
 	/* Sanitize CPU configuration */
 	call verify_cpu
 
+	/*
+	 * Perform pagetable fixups. Additionally, if SME is active, encrypt
+	 * the kernel and retrieve the modifier (SME encryption mask if SME
+	 * is active) to be added to the initial pgdir entry that will be
+	 * programmed into CR3.
+	 */
 	leaq	_text(%rip), %rdi
 	pushq	%rsi
 	call	__startup_64
 	popq	%rsi
 
-	movq	$(early_top_pgt - __START_KERNEL_map), %rax
+	/* Form the CR3 value being sure to include the CR3 modifier */
+	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
 	/* Sanitize CPU configuration */
 	call verify_cpu
 
-	movq	$(init_top_pgt - __START_KERNEL_map), %rax
+	/*
+	 * Retrieve the modifier (SME encryption mask if SME is active) to be
+	 * added to the initial pgdir entry that will be programmed into CR3.
+	 */
+	pushq	%rsi
+	call	__startup_secondary_64
+	popq	%rsi
+
+	/* Form the CR3 value being sure to include the CR3 modifier */
+	addq	$(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
 	/* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
 NEXT_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
-	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
 	.fill	512,8,0
 #else
 NEXT_PAGE(init_top_pgt)
-	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
-	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level3_ident_pgt)
-	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.fill	511, 8, 0
 NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
 #ifdef CONFIG_X86_5LEVEL
 NEXT_PAGE(level4_kernel_pgt)
 	.fill	511,8,0
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
 
 NEXT_PAGE(level2_fixmap_pgt)
 	.fill	506,8,0
-	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
 	.fill	5,8,0
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 16f82a3aaec7..8ce4212e2b8d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer)
 	return 0;
 }
 
-static int hpet_resume(struct clock_event_device *evt, int timer)
-{
-	if (!timer) {
-		hpet_enable_legacy_int();
-	} else {
-		struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
-		irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
-		irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
-		disable_hardirq(hdev->irq);
-		irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
-		enable_irq(hdev->irq);
-	}
+static int hpet_resume(struct clock_event_device *evt)
+{
+	hpet_enable_legacy_int();
 	hpet_print_config();
-
 	return 0;
 }
 
@@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt)
 
 static int hpet_legacy_resume(struct clock_event_device *evt)
 {
-	return hpet_resume(evt, 0);
+	return hpet_resume(evt);
 }
 
 static int hpet_legacy_next_event(unsigned long delta,
@@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt)
 static int hpet_msi_resume(struct clock_event_device *evt)
 {
 	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+	struct irq_data *data = irq_get_irq_data(hdev->irq);
+	struct msi_msg msg;
 
-	return hpet_resume(evt, hdev->num);
+	/* Restore the MSI msg and unmask the interrupt */
+	irq_chip_compose_msi_msg(data, &msg);
+	hpet_msi_write(hdev, &msg);
+	hpet_msi_unmask(data);
+	return 0;
 }
 
 static int hpet_msi_next_event(unsigned long delta,
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000000..6107ee1cb8d5
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,371 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+
+struct idt_data {
+	unsigned int	vector;
+	unsigned int	segment;
+	struct idt_bits	bits;
+	const void	*addr;
+};
+
+#define DPL0		0x0
+#define DPL3		0x3
+
+#define DEFAULT_STACK	0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment)	\
+	{						\
+		.vector		= _vector,		\
+		.bits.ist	= _ist,			\
+		.bits.type	= _type,		\
+		.bits.dpl	= _dpl,			\
+		.bits.p		= 1,			\
+		.addr		= _addr,		\
+		.segment	= _segment,		\
+	}
+
+/* Interrupt gate */
+#define INTG(_vector, _addr)				\
+	G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr)				\
+	G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist)			\
+	G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate with interrupt stack */
+#define SISTG(_vector, _addr, _ist)			\
+	G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt)				\
+	G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_idts[] = {
+	INTG(X86_TRAP_DB,		debug),
+	SYSG(X86_TRAP_BP,		int3),
+#ifdef CONFIG_X86_32
+	INTG(X86_TRAP_PF,		page_fault),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initdata struct idt_data def_idts[] = {
+	INTG(X86_TRAP_DE,		divide_error),
+	INTG(X86_TRAP_NMI,		nmi),
+	INTG(X86_TRAP_BR,		bounds),
+	INTG(X86_TRAP_UD,		invalid_op),
+	INTG(X86_TRAP_NM,		device_not_available),
+	INTG(X86_TRAP_OLD_MF,		coprocessor_segment_overrun),
+	INTG(X86_TRAP_TS,		invalid_TSS),
+	INTG(X86_TRAP_NP,		segment_not_present),
+	INTG(X86_TRAP_SS,		stack_segment),
+	INTG(X86_TRAP_GP,		general_protection),
+	INTG(X86_TRAP_SPURIOUS,		spurious_interrupt_bug),
+	INTG(X86_TRAP_MF,		coprocessor_error),
+	INTG(X86_TRAP_AC,		alignment_check),
+	INTG(X86_TRAP_XF,		simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+	TSKG(X86_TRAP_DF,		GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+	INTG(X86_TRAP_DF,		double_fault),
+#endif
+	INTG(X86_TRAP_DB,		debug),
+	INTG(X86_TRAP_NMI,		nmi),
+	INTG(X86_TRAP_BP,		int3),
+
+#ifdef CONFIG_X86_MCE
+	INTG(X86_TRAP_MC,		&machine_check),
+#endif
+
+	SYSG(X86_TRAP_OF,		overflow),
+#if defined(CONFIG_IA32_EMULATION)
+	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initdata struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+	INTG(RESCHEDULE_VECTOR,		reschedule_interrupt),
+	INTG(CALL_FUNCTION_VECTOR,	call_function_interrupt),
+	INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
+	INTG(IRQ_MOVE_CLEANUP_VECTOR,	irq_move_cleanup_interrupt),
+	INTG(REBOOT_VECTOR,		reboot_interrupt),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	INTG(THERMAL_APIC_VECTOR,	thermal_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	INTG(THRESHOLD_APIC_VECTOR,	threshold_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+	INTG(DEFERRED_ERROR_VECTOR,	deferred_error_interrupt),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	INTG(LOCAL_TIMER_VECTOR,	apic_timer_interrupt),
+	INTG(X86_PLATFORM_IPI_VECTOR,	x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+	INTG(POSTED_INTR_VECTOR,	kvm_posted_intr_ipi),
+	INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+	INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+	INTG(IRQ_WORK_VECTOR,		irq_work_interrupt),
+# endif
+	INTG(SPURIOUS_APIC_VECTOR,	spurious_interrupt),
+	INTG(ERROR_APIC_VECTOR,		error_interrupt),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_pf_idts[] = {
+	INTG(X86_TRAP_PF,		page_fault),
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initdata struct idt_data dbg_idts[] = {
+	INTG(X86_TRAP_DB,	debug),
+	INTG(X86_TRAP_BP,	int3),
+};
+#endif
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+	.size		= (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+	.address	= (unsigned long) idt_table,
+};
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initdata struct idt_data ist_idts[] = {
+	ISTG(X86_TRAP_DB,	debug,		DEBUG_STACK),
+	ISTG(X86_TRAP_NMI,	nmi,		NMI_STACK),
+	SISTG(X86_TRAP_BP,	int3,		DEBUG_STACK),
+	ISTG(X86_TRAP_DF,	double_fault,	DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+	ISTG(X86_TRAP_MC,	&machine_check,	MCE_STACK),
+#endif
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+const struct desc_ptr debug_idt_descr = {
+	.size		= IDT_ENTRIES * 16 - 1,
+	.address	= (unsigned long) debug_idt_table,
+};
+#endif
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+	unsigned long addr = (unsigned long) d->addr;
+
+	gate->offset_low	= (u16) addr;
+	gate->segment		= (u16) d->segment;
+	gate->bits		= d->bits;
+	gate->offset_middle	= (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+	gate->offset_high	= (u32) (addr >> 32);
+	gate->reserved		= 0;
+#endif
+}
+
+static void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+	gate_desc desc;
+
+	for (; size > 0; t++, size--) {
+		idt_init_desc(&desc, t);
+		write_idt_entry(idt, t->vector, &desc);
+		if (sys)
+			set_bit(t->vector, used_vectors);
+	}
+}
+
+static void set_intr_gate(unsigned int n, const void *addr)
+{
+	struct idt_data data;
+
+	BUG_ON(n > 0xFF);
+
+	memset(&data, 0, sizeof(data));
+	data.vector	= n;
+	data.addr	= addr;
+	data.segment	= __KERNEL_CS;
+	data.bits.type	= GATE_INTERRUPT;
+	data.bits.p	= 1;
+
+	idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X8664 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+	idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+			     true);
+	load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+	idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X8664 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why is 32bit and 64bit installing the PF handler at different
+ * places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+	idt_setup_from_table(idt_table, early_pf_idts,
+			     ARRAY_SIZE(early_pf_idts), true);
+}
+
+/**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+	idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
+}
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+	memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+
+	idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
+}
+#endif
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+	int i = FIRST_EXTERNAL_VECTOR;
+	void *entry;
+
+	idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+	for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
+		entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+		set_intr_gate(i, entry);
+	}
+
+	for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+#ifdef CONFIG_X86_LOCAL_APIC
+		set_bit(i, used_vectors);
+		set_intr_gate(i, spurious_interrupt);
+#else
+		entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+		set_intr_gate(i, entry);
+#endif
+	}
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+		set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+	for ( ; i < NR_VECTORS; i++)
+		set_intr_gate(i, early_ignore_irq);
+#endif
+	load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr:	The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+	struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+
+	load_idt(&idt);
+}
+
+void __init update_intr_gate(unsigned int n, const void *addr)
+{
+	if (WARN_ON_ONCE(!test_bit(n, used_vectors)))
+		return;
+	set_intr_gate(n, addr);
+}
+
+void alloc_intr_gate(unsigned int n, const void *addr)
+{
+	BUG_ON(n < FIRST_SYSTEM_VECTOR);
+	if (!test_and_set_bit(n, used_vectors))
+		set_intr_gate(n, addr);
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4aa03c5a14c9..52089c043160 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 atomic_t irq_err_count;
 
-/* Function pointer for generic interrupt vector handling */
-void (*x86_platform_ipi_callback)(void) = NULL;
-
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
 	seq_puts(p, "  APIC ICR read retries\n");
-#endif
 	if (x86_platform_ipi_callback) {
 		seq_printf(p, "%*s: ", prec, "PLT");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
 		seq_puts(p, "  Platform interrupts\n");
 	}
+#endif
 #ifdef CONFIG_SMP
 	seq_printf(p, "%*s: ", prec, "RES");
 	for_each_online_cpu(j)
@@ -155,6 +152,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
 	seq_puts(p, "  Posted-interrupt notification event\n");
 
+	seq_printf(p, "%*s: ", prec, "NPI");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->kvm_posted_intr_nested_ipis);
+	seq_puts(p, "  Nested posted-interrupt event\n");
+
 	seq_printf(p, "%*s: ", prec, "PIW");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ",
@@ -177,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_perf_irqs;
 	sum += irq_stats(cpu)->apic_irq_work_irqs;
 	sum += irq_stats(cpu)->icr_read_retry_count;
-#endif
 	if (x86_platform_ipi_callback)
 		sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -253,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
 /*
  * Handler for X86_PLATFORM_IPI_VECTOR.
  */
-void __smp_x86_platform_ipi(void)
-{
-	inc_irq_stat(x86_platform_ipis);
-
-	if (x86_platform_ipi_callback)
-		x86_platform_ipi_callback();
-}
-
 __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
 	entering_ack_irq();
-	__smp_x86_platform_ipi();
+	trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+	inc_irq_stat(x86_platform_ipis);
+	if (x86_platform_ipi_callback)
+		x86_platform_ipi_callback();
+	trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
 	exiting_irq();
 	set_irq_regs(old_regs);
 }
+#endif
 
 #ifdef CONFIG_HAVE_KVM
 static void dummy_handler(void) {}
@@ -313,21 +316,21 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
 	exiting_irq();
 	set_irq_regs(old_regs);
 }
-#endif
 
-__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
+/*
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
 	entering_ack_irq();
-	trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
-	__smp_x86_platform_ipi();
-	trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+	inc_irq_stat(kvm_posted_intr_nested_ipis);
 	exiting_irq();
 	set_irq_regs(old_regs);
 }
+#endif
 
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
 
@@ -412,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void)
 		 * this w/o holding vector_lock.
 		 */
 		for (vector = FIRST_EXTERNAL_VECTOR;
-		     vector < first_system_vector; vector++) {
+		     vector < FIRST_SYSTEM_VECTOR; vector++) {
 			if (!test_bit(vector, used_vectors) &&
 			    IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
 				if (++count == this_count)
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 275487872be2..70dee056f92b 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -11,35 +11,23 @@
 #include <asm/trace/irq_vectors.h>
 #include <linux/interrupt.h>
 
-static inline void __smp_irq_work_interrupt(void)
-{
-	inc_irq_stat(apic_irq_work_irqs);
-	irq_work_run();
-}
-
+#ifdef CONFIG_X86_LOCAL_APIC
 __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
-	__smp_irq_work_interrupt();
-	exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
-{
-	ipi_entering_ack_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
-	__smp_irq_work_interrupt();
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_work_run();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);
 	exiting_irq();
 }
 
 void arch_irq_work_raise(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	if (!arch_irq_work_has_interrupt())
 		return;
 
 	apic->send_IPI_self(IRQ_WORK_VECTOR);
 	apic_wait_icr_idle();
-#endif
 }
+#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7468c6987547..1add9e08e83e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
-			return 1;
-	}
-
-	return 0;
-}
-
 void __init init_ISA_irqs(void)
 {
 	struct irq_chip *chip = legacy_pic->chip;
@@ -99,98 +87,12 @@ void __init init_IRQ(void)
 	x86_init.irqs.intr_init();
 }
 
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for generic single function call */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-			call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-
-	/* IPI used for rebooting/stopping */
-	alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
-	smp_intr_init();
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#ifdef CONFIG_X86_MCE_THRESHOLD
-	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
-	alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI for X86 platform specific use */
-	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
-#ifdef CONFIG_HAVE_KVM
-	/* IPI for KVM to deliver posted interrupt */
-	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
-	/* IPI for KVM to deliver interrupt to wake up tasks */
-	alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
-#endif
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	/* IRQ work interrupts: */
-# ifdef CONFIG_IRQ_WORK
-	alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
-# endif
-
-#endif
-}
-
 void __init native_init_IRQ(void)
 {
-	int i;
-
 	/* Execute any quirks before the call gates are initialised: */
 	x86_init.irqs.pre_vector_init();
 
-	apic_intr_init();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	i = FIRST_EXTERNAL_VECTOR;
-#ifndef CONFIG_X86_LOCAL_APIC
-#define first_system_vector NR_VECTORS
-#endif
-	for_each_clear_bit_from(i, used_vectors, first_system_vector) {
-		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-		set_intr_gate(i, irq_entries_start +
-				8 * (i - FIRST_EXTERNAL_VECTOR));
-	}
-#ifdef CONFIG_X86_LOCAL_APIC
-	for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
-		set_intr_gate(i, spurious_interrupt);
-#endif
+	idt_setup_apic_and_irq_gates();
 
 	if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 38b64587b31b..fd6f8fbbe6f2 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 	struct setup_data_node *node = file->private_data;
 	unsigned long remain;
 	loff_t pos = *ppos;
-	struct page *pg;
 	void *p;
 	u64 pa;
 
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 		count = node->len - pos;
 
 	pa = node->paddr + sizeof(struct setup_data) + pos;
-	pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
-	if (PageHighMem(pg)) {
-		p = ioremap_cache(pa, count);
-		if (!p)
-			return -ENXIO;
-	} else
-		p = __va(pa);
+	p = memremap(pa, count, MEMREMAP_WB);
+	if (!p)
+		return -ENOMEM;
 
 	remain = copy_to_user(user_buf, p, count);
 
-	if (PageHighMem(pg))
-		iounmap(p);
+	memunmap(p);
 
 	if (remain)
 		return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 	struct setup_data *data;
 	int error;
 	struct dentry *d;
-	struct page *pg;
 	u64 pa_data;
 	int no = 0;
 
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 			goto err_dir;
 		}
 
-		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
-		if (PageHighMem(pg)) {
-			data = ioremap_cache(pa_data, sizeof(*data));
-			if (!data) {
-				kfree(node);
-				error = -ENXIO;
-				goto err_dir;
-			}
-		} else
-			data = __va(pa_data);
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+		if (!data) {
+			kfree(node);
+			error = -ENOMEM;
+			goto err_dir;
+		}
 
 		node->paddr = pa_data;
 		node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 		error = create_setup_data_node(d, no, node);
 		pa_data = data->next;
 
-		if (PageHighMem(pg))
-			iounmap(data);
+		memunmap(data);
 		if (error)
 			goto err_dir;
 		no++;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6b877807598b..f0153714ddac 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p)
 
 int arch_prepare_kprobe(struct kprobe *p)
 {
+	int ret;
+
 	if (alternatives_text_reserved(p->addr, p->addr))
 		return -EINVAL;
 
@@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p)
 	if (!p->ainsn.insn)
 		return -ENOMEM;
 
-	return arch_copy_kprobe(p);
+	ret = arch_copy_kprobe(p);
+	if (ret) {
+		free_insn_slot(p->ainsn.insn, 0);
+		p->ainsn.insn = NULL;
+	}
+
+	return ret;
 }
 
 void arch_arm_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
 #include <asm/insn.h>
 #include <asm/debugreg.h>
 #include <asm/set_memory.h>
+#include <asm/sections.h>
 
 #include "common.h"
 
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
 
 	/*
 	 * Do not optimize in the entry code due to the unstable
-	 * stack handling.
+	 * stack handling and registers setup.
 	 */
-	if ((paddr >= (unsigned long)__entry_text_start) &&
-	    (paddr <  (unsigned long)__entry_text_end))
+	if (((paddr >= (unsigned long)__entry_text_start) &&
+	     (paddr <  (unsigned long)__entry_text_end)) ||
+	    ((paddr >= (unsigned long)__irqentry_text_start) &&
+	     (paddr <  (unsigned long)__irqentry_text_end)))
 		return 0;
 
 	/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4afc67f5facc..4b0592ca9e47 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 
-#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
@@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group boot_params_attr_group = {
+static const struct attribute_group boot_params_attr_group = {
 	.attrs = boot_params_version_attrs,
 	.bin_attrs = boot_params_data_attrs,
 };
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
 			*paddr = pa_data;
 			return 0;
 		}
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
 	u64 pa_data = boot_params.hdr.setup_data;
 
 	while (pa_data) {
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 		if (nr == i) {
 			*size = data->len;
-			iounmap(data);
+			memunmap(data);
 			return 0;
 		}
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
 	if (!data)
 		return -ENOMEM;
 
 	ret = sprintf(buf, "0x%x\n", data->type);
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
 	if (!data)
 		return -ENOMEM;
 
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
 		goto out;
 
 	ret = count;
-	p = ioremap_cache(paddr + sizeof(*data), data->len);
+	p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
 	if (!p) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	memcpy(buf, p + off, count);
-	iounmap(p);
+	memunmap(p);
 out:
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group setup_data_attr_group = {
+static const struct attribute_group setup_data_attr_group = {
 	.attrs = setup_data_type_attrs,
 	.bin_attrs = setup_data_data_attrs,
 };
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
 	*nr = 0;
 	while (pa_data) {
 		*nr += 1;
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 	}
 
 out:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 71c17a5be983..874827b0d7ca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token)
 		if (hlist_unhashed(&n.link))
 			break;
 
+		rcu_irq_exit();
+
 		if (!n.halted) {
 			local_irq_enable();
 			schedule();
@@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token)
 			/*
 			 * We cannot reschedule. So halt.
 			 */
-			rcu_irq_exit();
 			native_safe_halt();
 			local_irq_disable();
-			rcu_irq_enter();
 		}
+
+		rcu_irq_enter();
 	}
 	if (!n.halted)
 		finish_swait(&n.wq, &wait);
@@ -261,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		trace_do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
@@ -453,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 static void __init kvm_apf_trap_init(void)
 {
-	set_intr_gate(14, async_page_fault);
+	update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
 void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+	unsigned short sel;
+
+	/*
+	 * Make sure that the cached DS and ES descriptors match the updated
+	 * LDT.
+	 */
+	savesegment(ds, sel);
+	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(ds, sel);
+
+	savesegment(es, sel);
+	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(es, sel);
+#endif
+}
+
 /* context.lock is held for us, so we don't need any locking. */
 static void flush_ldt(void *__mm)
 {
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
 
 	pc = &mm->context;
 	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+	refresh_ldt_segments();
 }
 
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8c53c5d7a1bc..00bc751c861c 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -26,18 +26,6 @@
 #include <asm/set_memory.h>
 #include <asm/debugreg.h>
 
-static void set_idt(void *newidt, __u16 limit)
-{
-	struct desc_ptr curidt;
-
-	/* ia32 supports unaliged loads & stores */
-	curidt.size    = limit;
-	curidt.address = (unsigned long)newidt;
-
-	load_idt(&curidt);
-}
-
-
 static void set_gdt(void *newgdt, __u16 limit)
 {
 	struct desc_ptr curgdt;
@@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image)
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
 	set_gdt(phys_to_virt(0), 0);
-	set_idt(phys_to_virt(0), 0);
+	idt_invalidate(phys_to_virt(0));
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cb0a30473c23..1f790cf9d38f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
-	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
 	return 0;
 err:
 	free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 		.alloc_pgt_page	= alloc_pgt_page,
 		.context	= image,
 		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
+		.kernpg_flag	= _KERNPG_TABLE_NOENC,
 	};
 	unsigned long mstart, mend;
 	pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
 	image->start = relocate_kernel((unsigned long)image->head,
 				       (unsigned long)page_list,
 				       image->start,
-				       image->preserve_context);
+				       image->preserve_context,
+				       sme_active());
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
 {
 	kexec_mark_crashkres(false);
 }
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+	/*
+	 * If SME is active we need to be sure that kexec pages are
+	 * not encrypted because when we boot to the new kernel the
+	 * pages won't be accessed encrypted (initially).
+	 */
+	return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+	/*
+	 * If SME is active we need to reset the pages back to being
+	 * an encrypted mapping before freeing them.
+	 */
+	set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f67bd3205df7..62e7d70aadd5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/setup.h>
+#include <asm/unwind.h>
 
 #if 0
 #define DEBUGP(fmt, ...)				\
@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    struct module *me)
 {
 	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
-		*para = NULL;
+		*para = NULL, *orc = NULL, *orc_ip = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 			locks = s;
 		if (!strcmp(".parainstructions", secstrings + s->sh_name))
 			para = s;
+		if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+			orc = s;
+		if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+			orc_ip = s;
 	}
 
 	if (alt) {
@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 	/* make jump label nops */
 	jump_label_apply_nops(me);
 
+	if (orc && orc_ip)
+		unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+				   (void *)orc->sh_addr, orc->sh_size);
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0d904d759ff1..5cbb3177ed17 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	}
 }
 
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
 
 static unsigned long __init get_mpc_size(unsigned long physptr)
 {
 	struct mpc_table *mpc;
 	unsigned long size;
 
-	mpc = early_ioremap(physptr, PAGE_SIZE);
+	mpc = early_memremap(physptr, PAGE_SIZE);
 	size = mpc->length;
-	early_iounmap(mpc, PAGE_SIZE);
+	early_memunmap(mpc, PAGE_SIZE);
 	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
 
 	return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 	unsigned long size;
 
 	size = get_mpc_size(mpf->physptr);
-	mpc = early_ioremap(mpf->physptr, size);
+	mpc = early_memremap(mpf->physptr, size);
+
 	/*
 	 * Read the physical hardware table.  Anything here will
 	 * override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #endif
 		pr_err("BIOS bug, MP table errors detected!...\n");
 		pr_cont("... disabling SMP support. (tell your hw vendor)\n");
-		early_iounmap(mpc, size);
+		early_memunmap(mpc, size);
 		return -1;
 	}
-	early_iounmap(mpc, size);
+	early_memunmap(mpc, size);
 
 	if (early)
 		return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
  */
 void __init default_get_smp_config(unsigned int early)
 {
-	struct mpf_intel *mpf = mpf_found;
+	struct mpf_intel *mpf;
 
 	if (!smp_found_config)
 		return;
 
-	if (!mpf)
+	if (!mpf_base)
 		return;
 
 	if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
 	if (acpi_lapic && acpi_ioapic)
 		return;
 
+	mpf = early_memremap(mpf_base, sizeof(*mpf));
+	if (!mpf) {
+		pr_err("MPTABLE: error mapping MP table\n");
+		return;
+	}
+
 	pr_info("Intel MultiProcessor Specification v1.%d\n",
 		mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
 	/*
 	 * Now see if we need to read further.
 	 */
-	if (mpf->feature1 != 0) {
+	if (mpf->feature1) {
 		if (early) {
 			/*
 			 * local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
-		if (check_physptr(mpf, early))
+		if (check_physptr(mpf, early)) {
+			early_memunmap(mpf, sizeof(*mpf));
 			return;
+		}
 	} else
 		BUG();
 
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
 	/*
 	 * Only use the first configuration found.
 	 */
+
+	early_memunmap(mpf, sizeof(*mpf));
 }
 
 static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
 
 static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
-	unsigned int *bp = phys_to_virt(base);
+	unsigned int *bp;
 	struct mpf_intel *mpf;
-	unsigned long mem;
+	int ret = 0;
 
 	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
 		    base, base + length - 1);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
+		bp = early_memremap(base, length);
 		mpf = (struct mpf_intel *)bp;
 		if ((*bp == SMP_MAGIC_IDENT) &&
 		    (mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 1;
 #endif
-			mpf_found = mpf;
+			mpf_base = base;
 
-			pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-				(unsigned long long) virt_to_phys(mpf),
-				(unsigned long long) virt_to_phys(mpf) +
-				sizeof(*mpf) - 1, mpf);
+			pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+				base, base + sizeof(*mpf) - 1, mpf);
 
-			mem = virt_to_phys(mpf);
-			memblock_reserve(mem, sizeof(*mpf));
+			memblock_reserve(base, sizeof(*mpf));
 			if (mpf->physptr)
 				smp_reserve_memory(mpf);
 
-			return 1;
+			ret = 1;
 		}
-		bp += 4;
+		early_memunmap(bp, length);
+
+		if (ret)
+			break;
+
+		base += 16;
 		length -= 16;
 	}
-	return 0;
+	return ret;
 }
 
 void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
 	char oem[10];
 	struct mpf_intel *mpf;
 	struct mpc_table *mpc, *mpc_new;
+	unsigned long size;
 
 	if (!enable_update_mptable)
 		return 0;
 
-	mpf = mpf_found;
-	if (!mpf)
+	if (!mpf_base)
 		return 0;
 
+	mpf = early_memremap(mpf_base, sizeof(*mpf));
+	if (!mpf) {
+		pr_err("MPTABLE: mpf early_memremap() failed\n");
+		return 0;
+	}
+
 	/*
 	 * Now see if we need to go further.
 	 */
-	if (mpf->feature1 != 0)
-		return 0;
+	if (mpf->feature1)
+		goto do_unmap_mpf;
 
 	if (!mpf->physptr)
-		return 0;
+		goto do_unmap_mpf;
 
-	mpc = phys_to_virt(mpf->physptr);
+	size = get_mpc_size(mpf->physptr);
+	mpc = early_memremap(mpf->physptr, size);
+	if (!mpc) {
+		pr_err("MPTABLE: mpc early_memremap() failed\n");
+		goto do_unmap_mpf;
+	}
 
 	if (!smp_check_mpc(mpc, oem, str))
-		return 0;
+		goto do_unmap_mpc;
 
-	pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+	pr_info("mpf: %llx\n", (u64)mpf_base);
 	pr_info("physptr: %x\n", mpf->physptr);
 
 	if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
 		new = mpf_checksum((unsigned char *)mpc, mpc->length);
 		if (old == new) {
 			pr_info("mpc is readonly, please try alloc_mptable instead\n");
-			return 0;
+			goto do_unmap_mpc;
 		}
 		pr_info("use in-position replacing\n");
 	} else {
+		mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+		if (!mpc_new) {
+			pr_err("MPTABLE: new mpc early_memremap() failed\n");
+			goto do_unmap_mpc;
+		}
 		mpf->physptr = mpc_new_phys;
-		mpc_new = phys_to_virt(mpc_new_phys);
 		memcpy(mpc_new, mpc, mpc->length);
+		early_memunmap(mpc, size);
 		mpc = mpc_new;
+		size = mpc_new_length;
 		/* check if we can modify that */
 		if (mpc_new_phys - mpf->physptr) {
 			struct mpf_intel *mpf_new;
 			/* steal 16 bytes from [0, 1k) */
+			mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+			if (!mpf_new) {
+				pr_err("MPTABLE: new mpf early_memremap() failed\n");
+				goto do_unmap_mpc;
+			}
 			pr_info("mpf new: %x\n", 0x400 - 16);
-			mpf_new = phys_to_virt(0x400 - 16);
 			memcpy(mpf_new, mpf, 16);
+			early_memunmap(mpf, sizeof(*mpf));
 			mpf = mpf_new;
 			mpf->physptr = mpc_new_phys;
 		}
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
 	 */
 	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
 
+do_unmap_mpc:
+	early_memunmap(mpc, size);
+
+do_unmap_mpf:
+	early_memunmap(mpf, sizeof(*mpf));
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 446c8aa09b9b..35aafc95e4b8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -39,26 +39,26 @@
 #include <trace/events/nmi.h>
 
 struct nmi_desc {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct list_head head;
 };
 
 static struct nmi_desc nmi_desc[NMI_MAX] = 
 {
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[0].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[1].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[2].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[3].head),
 	},
 
@@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 
 	init_irq_work(&action->irq_work, nmi_max_handler);
 
-	spin_lock_irqsave(&desc->lock, flags);
+	raw_spin_lock_irqsave(&desc->lock, flags);
 
 	/*
 	 * Indicate if there are multiple registrations on the
@@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 	else
 		list_add_tail_rcu(&action->list, &desc->head);
 	
-	spin_unlock_irqrestore(&desc->lock, flags);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
 EXPORT_SYMBOL(__register_nmi_handler);
@@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 	struct nmiaction *n;
 	unsigned long flags;
 
-	spin_lock_irqsave(&desc->lock, flags);
+	raw_spin_lock_irqsave(&desc->lock, flags);
 
 	list_for_each_entry_rcu(n, &desc->head, list) {
 		/*
@@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 		}
 	}
 
-	spin_unlock_irqrestore(&desc->lock, flags);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849589bb..a14df9eecfed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = {
 	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
-#ifdef CONFIG_X86_64
-	.adjust_exception_frame = paravirt_nop,
-#endif
 };
 
 __visible struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5e16d3f29594..0accc2404b92 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -93,9 +93,12 @@ again:
 	if (gfpflags_allow_blocking(flag)) {
 		page = dma_alloc_from_contiguous(dev, count, get_order(size),
 						 flag);
-		if (page && page_to_phys(page) + size > dma_mask) {
-			dma_release_from_contiguous(dev, page, count);
-			page = NULL;
+		if (page) {
+			addr = phys_to_dma(dev, page_to_phys(page));
+			if (addr + size > dma_mask) {
+				dma_release_from_contiguous(dev, page, count);
+				page = NULL;
+			}
 		}
 	}
 	/* fallback */
@@ -104,7 +107,7 @@ again:
 	if (!page)
 		return NULL;
 
-	addr = page_to_phys(page);
+	addr = phys_to_dma(dev, page_to_phys(page));
 	if (addr + size > dma_mask) {
 		__free_pages(page, get_order(size));
 
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a6d404087fe3..4fc3cb60ea11 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 				 enum dma_data_direction dir,
 				 unsigned long attrs)
 {
-	dma_addr_t bus = page_to_phys(page) + offset;
+	dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
 	WARN_ON(size == 0);
 	if (!check_addr("map_single", dev, bus, size))
 		return NOMMU_MAPPING_ERROR;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 1e23577e17cf..677077510e30 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,12 +6,14 @@
 #include <linux/swiotlb.h>
 #include <linux/bootmem.h>
 #include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 #include <asm/xen/swiotlb-xen.h>
 #include <asm/iommu_table.h>
+
 int swiotlb __read_mostly;
 
 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
 		  pci_swiotlb_late_init);
 
 /*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
  */
 int __init pci_swiotlb_detect_4gb(void)
 {
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
 	if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
 		swiotlb = 1;
 #endif
+
+	/*
+	 * If SME is active then swiotlb will be set to 1 so that bounce
+	 * buffers are allocated and used for devices that do not support
+	 * the addressing range required for the encryption mask.
+	 */
+	if (sme_active())
+		swiotlb = 1;
+
 	return swiotlb;
 }
 IOMMU_INIT(pci_swiotlb_detect_4gb,
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
 		x86_platform.legacy.reserve_bios_regions = 1;
 		break;
 	case X86_SUBARCH_XEN:
-	case X86_SUBARCH_LGUEST:
 		x86_platform.legacy.devices.pnpbios = 0;
 		x86_platform.legacy.rtc = 0;
 		break;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..bd6b85fac666 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
 	return ret;
 }
 #endif
+
 void stop_this_cpu(void *dummy)
 {
 	local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
-	for (;;)
-		halt();
+	for (;;) {
+		/*
+		 * Use wbinvd followed by hlt to stop the processor. This
+		 * provides support for kexec on a processor that supports
+		 * SME. With kexec, going from SME inactive to SME active
+		 * requires clearing cache entries so that addresses without
+		 * the encryption bit set don't corrupt the same physical
+		 * address that has the encryption bit set when caches are
+		 * flushed. To achieve this a wbinvd is performed followed by
+		 * a hlt. Even if the processor is not in the kexec/SME
+		 * scenario this only adds a wbinvd to a halting processor.
+		 */
+		asm volatile("wbinvd; hlt" : : : "memory");
+	}
 }
 
 /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c6d6dc5f8bb2..11966251cd42 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,7 @@
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
 #include <asm/proto.h>
 
 void __show_regs(struct pt_regs *regs, int all)
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	if (user_mode(regs)) {
 		sp = regs->sp;
-		ss = regs->ss & 0xffff;
+		ss = regs->ss;
 		gs = get_user_gs(regs);
 	} else {
 		sp = kernel_stack_pointer(regs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..302e7b2572d1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/vdso.h>
-#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_sched.h>
 #include <asm/unistd.h>
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
 	unsigned int fsindex, gsindex;
 	unsigned int ds, cs, es;
 
-	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
-		(void *)regs->ip);
+	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
 	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
 		regs->sp, regs->flags);
 	if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
 	}
 }
 
+enum which_selector {
+	FS,
+	GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+					     unsigned short selector,
+					     enum which_selector which)
+{
+	if (likely(selector == 0)) {
+		/*
+		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
+		 * be the pre-existing saved base or it could be zero.  On AMD
+		 * (with X86_BUG_NULL_SEG), the segment base could be almost
+		 * anything.
+		 *
+		 * This branch is very hot (it's hit twice on almost every
+		 * context switch between 64-bit programs), and avoiding
+		 * the RDMSR helps a lot, so we just assume that whatever
+		 * value is already saved is correct.  This matches historical
+		 * Linux behavior, so it won't break existing applications.
+		 *
+		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+		 * report that the base is zero, it needs to actually be zero:
+		 * see the corresponding logic in load_seg_legacy.
+		 */
+	} else {
+		/*
+		 * If the selector is 1, 2, or 3, then the base is zero on
+		 * !X86_BUG_NULL_SEG CPUs and could be anything on
+		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
+		 * has never attempted to preserve the base across context
+		 * switches.
+		 *
+		 * If selector > 3, then it refers to a real segment, and
+		 * saving the base isn't necessary.
+		 */
+		if (which == FS)
+			prev_p->thread.fsbase = 0;
+		else
+			prev_p->thread.gsbase = 0;
+	}
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+	savesegment(fs, task->thread.fsindex);
+	savesegment(gs, task->thread.gsindex);
+	save_base_legacy(task, task->thread.fsindex, FS);
+	save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+				    unsigned short sel)
+{
+	if (which == FS)
+		loadsegment(fs, sel);
+	else
+		load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+					    unsigned long prev_base,
+					    unsigned short next_index,
+					    unsigned long next_base,
+					    enum which_selector which)
+{
+	if (likely(next_index <= 3)) {
+		/*
+		 * The next task is using 64-bit TLS, is not using this
+		 * segment at all, or is having fun with arcane CPU features.
+		 */
+		if (next_base == 0) {
+			/*
+			 * Nasty case: on AMD CPUs, we need to forcibly zero
+			 * the base.
+			 */
+			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+				loadseg(which, __USER_DS);
+				loadseg(which, next_index);
+			} else {
+				/*
+				 * We could try to exhaustively detect cases
+				 * under which we can skip the segment load,
+				 * but there's really only one case that matters
+				 * for performance: if both the previous and
+				 * next states are fully zeroed, we can skip
+				 * the load.
+				 *
+				 * (This assumes that prev_base == 0 has no
+				 * false positives.  This is the case on
+				 * Intel-style CPUs.)
+				 */
+				if (likely(prev_index | next_index | prev_base))
+					loadseg(which, next_index);
+			}
+		} else {
+			if (prev_index != next_index)
+				loadseg(which, next_index);
+			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+			       next_base);
+		}
+	} else {
+		/*
+		 * The next task is using a real segment.  Loading the selector
+		 * is sufficient.
+		 */
+		loadseg(which, next_index);
+	}
+}
+
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 		unsigned long arg, struct task_struct *p, unsigned long tls)
 {
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 {
+	WARN_ON_ONCE(regs != current_pt_regs());
+
+	if (static_cpu_has(X86_BUG_NULL_SEG)) {
+		/* Loading zero below won't clear the base. */
+		loadsegment(fs, __USER_DS);
+		load_gs_index(__USER_DS);
+	}
+
 	loadsegment(fs, 0);
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
+
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->cs		= _cs;
@@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	unsigned prev_fsindex, prev_gsindex;
+
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+		     this_cpu_read(irq_count) != -1);
 
 	switch_fpu_prepare(prev_fpu, cpu);
 
@@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 *
 	 * (e.g. xen_load_tls())
 	 */
-	savesegment(fs, prev_fsindex);
-	savesegment(gs, prev_gsindex);
+	save_fsgs(prev_p);
 
 	/*
 	 * Load TLS before restoring any segments so that segment loads
@@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
-	/*
-	 * Switch FS and GS.
-	 *
-	 * These are even more complicated than DS and ES: they have
-	 * 64-bit bases are that controlled by arch_prctl.  The bases
-	 * don't necessarily match the selectors, as user code can do
-	 * any number of things to cause them to be inconsistent.
-	 *
-	 * We don't promise to preserve the bases if the selectors are
-	 * nonzero.  We also don't promise to preserve the base if the
-	 * selector is zero and the base doesn't match whatever was
-	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
-	 * FSGSBASE instructions are enabled, we'll need to offer
-	 * stronger guarantees.)
-	 *
-	 * As an invariant,
-	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
-	 * impossible.
-	 */
-	if (next->fsindex) {
-		/* Loading a nonzero value into FS sets the index and base. */
-		loadsegment(fs, next->fsindex);
-	} else {
-		if (next->fsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_fsindex)
-				loadsegment(fs, 0);
-			wrmsrl(MSR_FS_BASE, next->fsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 */
-				loadsegment(fs, __USER_DS);
-				loadsegment(fs, 0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_FS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->fsbase || prev_fsindex)
-					loadsegment(fs, 0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_fsindex)
-		prev->fsbase = 0;
-	prev->fsindex = prev_fsindex;
-
-	if (next->gsindex) {
-		/* Loading a nonzero value into GS sets the index and base. */
-		load_gs_index(next->gsindex);
-	} else {
-		if (next->gsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_gsindex)
-				load_gs_index(0);
-			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 *
-				 * This contains a pointless SWAPGS pair.
-				 * Fixing it would involve an explicit check
-				 * for Xen or a new pvop.
-				 */
-				load_gs_index(__USER_DS);
-				load_gs_index(0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_GS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->gsbase || prev_gsindex)
-					load_gs_index(0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_gsindex)
-		prev->gsbase = 0;
-	prev->gsindex = prev_gsindex;
+	load_seg_legacy(prev->fsindex, prev->fsbase,
+			next->fsindex, next->fsbase, FS);
+	load_seg_legacy(prev->gsindex, prev->gsbase,
+			next->gsindex, next->gsbase, GS);
 
 	switch_fpu_finish(next_fpu, cpu);
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 67393fc88353..54984b142641 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -38,8 +38,6 @@
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
-static const struct desc_ptr no_idt = {};
-
 /*
  * This is set if we need to go through the 'emergency' path.
  * When machine_emergency_restart() is called, we may be on
@@ -471,12 +469,12 @@ static int __init reboot_init(void)
 
 	/*
 	 * The DMI quirks table takes precedence. If no quirks entry
-	 * matches and the ACPI Hardware Reduced bit is set, force EFI
-	 * reboot.
+	 * matches and the ACPI Hardware Reduced bit is set and EFI
+	 * runtime services are enabled, force EFI reboot.
 	 */
 	rv = dmi_check_system(reboot_dmi_table);
 
-	if (!rv && efi_reboot_required())
+	if (!rv && efi_reboot_required() && !efi_runtime_disabled())
 		reboot_type = BOOT_EFI;
 
 	return 0;
@@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void)
 			break;
 
 		case BOOT_TRIPLE:
-			load_idt(&no_idt);
+			idt_invalidate(NULL);
 			__asm__ __volatile__("int3");
 
 			/* We're probably dead after this, but... */
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 98111b38ebfd..307d3bac5f04 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,6 +47,7 @@ relocate_kernel:
 	 * %rsi page_list
 	 * %rdx start address
 	 * %rcx preserve_context
+	 * %r8  sme_active
 	 */
 
 	/* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
 	pushq $0
 	popfq
 
+	/* Save SME active flag */
+	movq	%r8, %r12
+
 	/*
 	 * get physical address of control page now
 	 * this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
 	/* Flush the TLB (needed?) */
 	movq	%r9, %cr3
 
+	/*
+	 * If SME is active, there could be old encrypted cache line
+	 * entries that will conflict with the now unencrypted memory
+	 * used by kexec. Flush the caches before copying the kernel.
+	 */
+	testq	%r12, %r12
+	jz 1f
+	wbinvd
+1:
+
 	movq	%rcx, %r11
 	call	swap_pages
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..9cc16a841745 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -69,6 +69,7 @@
 #include <linux/crash_dump.h>
 #include <linux/tboot.h>
 #include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
 
 #include <linux/usb/xhci-dbgp.h>
 #include <video/edid.h>
@@ -115,6 +116,7 @@
 #include <asm/microcode.h>
 #include <asm/mmu_context.h>
 #include <asm/kaslr.h>
+#include <asm/unwind.h>
 
 /*
  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -374,6 +376,14 @@ static void __init reserve_initrd(void)
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
+	/*
+	 * If SME is active, this memory will be marked encrypted by the
+	 * kernel when it is accessed (including relocation). However, the
+	 * ramdisk image was loaded decrypted by the bootloader, so make
+	 * sure that it is encrypted before accessing it.
+	 */
+	sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
 	initrd_start = 0;
 
 	mapped_size = memblock_mem_size(max_pfn_mapped);
@@ -890,7 +900,7 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	olpc_ofw_detect();
 
-	early_trap_init();
+	idt_setup_early_traps();
 	early_cpu_init();
 	early_ioremap_init();
 
@@ -1161,7 +1171,7 @@ void __init setup_arch(char **cmdline_p)
 
 	init_mem_mapping();
 
-	early_trap_pf_init();
+	idt_setup_early_pf();
 
 	/*
 	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
@@ -1310,6 +1320,8 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled(EFI_BOOT))
 		efi_apply_memmap_quirks();
 #endif
+
+	unwind_init();
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 10edd1e69a68..6e8fcb6f7e1e 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr)
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
-	struct desc_struct gdt;
+	struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+					      0xFFFFF);
 
-	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
-			0x2 | DESCTYPE_S, 0x8);
-	gdt.s = 1;
-	write_gdt_entry(get_cpu_gdt_rw(cpu),
-			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
 #endif
 }
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc30a74e4adb..e04442345fc0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 			sp = current->sas_ss_sp + current->sas_ss_size;
 	} else if (IS_ENABLED(CONFIG_X86_32) &&
 		   !onsigstack &&
-		   (regs->ss & 0xffff) != __USER_DS &&
+		   regs->ss != __USER_DS &&
 		   !(ka->sa.sa_flags & SA_RESTORER) &&
 		   ka->sa.sa_restorer) {
 		/* This is the legacy signal stack switching. */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d798c0da451c..5c574dff4c1a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -254,84 +254,45 @@ finish:
 }
 
 /*
- * Reschedule call back.
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode
  */
-static inline void __smp_reschedule_interrupt(void)
-{
-	inc_irq_stat(irq_resched_count);
-	scheduler_ipi();
-}
-
 __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
-	__smp_reschedule_interrupt();
-	/*
-	 * KVM uses this interrupt to force a cpu out of guest mode
-	 */
-}
-
-__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
-{
-	/*
-	 * Need to call irq_enter() before calling the trace point.
-	 * __smp_reschedule_interrupt() calls irq_enter/exit() too (in
-	 * scheduler_ipi(). This is OK, since those functions are allowed
-	 * to nest.
-	 */
-	ipi_entering_ack_irq();
-	trace_reschedule_entry(RESCHEDULE_VECTOR);
-	__smp_reschedule_interrupt();
-	trace_reschedule_exit(RESCHEDULE_VECTOR);
-	exiting_irq();
-	/*
-	 * KVM uses this interrupt to force a cpu out of guest mode
-	 */
-}
+	inc_irq_stat(irq_resched_count);
 
-static inline void __smp_call_function_interrupt(void)
-{
-	generic_smp_call_function_interrupt();
-	inc_irq_stat(irq_call_count);
+	if (trace_resched_ipi_enabled()) {
+		/*
+		 * scheduler_ipi() might call irq_enter() as well, but
+		 * nested calls are fine.
+		 */
+		irq_enter();
+		trace_reschedule_entry(RESCHEDULE_VECTOR);
+		scheduler_ipi();
+		trace_reschedule_exit(RESCHEDULE_VECTOR);
+		irq_exit();
+		return;
+	}
+	scheduler_ipi();
 }
 
 __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
-	__smp_call_function_interrupt();
-	exiting_irq();
-}
-
-__visible void __irq_entry
-smp_trace_call_function_interrupt(struct pt_regs *regs)
-{
-	ipi_entering_ack_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
-	__smp_call_function_interrupt();
-	trace_call_function_exit(CALL_FUNCTION_VECTOR);
-	exiting_irq();
-}
-
-static inline void __smp_call_function_single_interrupt(void)
-{
-	generic_smp_call_function_single_interrupt();
 	inc_irq_stat(irq_call_count);
-}
-
-__visible void __irq_entry
-smp_call_function_single_interrupt(struct pt_regs *regs)
-{
-	ipi_entering_ack_irq();
-	__smp_call_function_single_interrupt();
+	generic_smp_call_function_interrupt();
+	trace_call_function_exit(CALL_FUNCTION_VECTOR);
 	exiting_irq();
 }
 
-__visible void __irq_entry
-smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
 {
 	ipi_entering_ack_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
-	__smp_call_function_single_interrupt();
+	inc_irq_stat(irq_call_count);
+	generic_smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
 	exiting_irq();
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b474c8de7fba..54b9e89d4d6b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
  * Returns zero if CPU booted OK, else error code from
  * ->wakeup_secondary_cpu.
  */
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+		       int *cpu0_nmi_registered)
 {
 	volatile u32 *trampoline_status =
 		(volatile u32 *) __va(real_mode_header->trampoline_status);
@@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	unsigned long start_ip = real_mode_header->trampoline_start;
 
 	unsigned long boot_error = 0;
-	int cpu0_nmi_registered = 0;
 	unsigned long timeout;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
@@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
 	else
 		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
-						     &cpu0_nmi_registered);
+						     cpu0_nmi_registered);
 
 	if (!boot_error) {
 		/*
@@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		 */
 		smpboot_restore_warm_reset_vector();
 	}
-	/*
-	 * Clean up the nmi handler. Do this after the callin and callout sync
-	 * to avoid impact of possible long unregister time.
-	 */
-	if (cpu0_nmi_registered)
-		unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
 
 	return boot_error;
 }
@@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int apicid = apic->cpu_present_to_apicid(cpu);
+	int cpu0_nmi_registered = 0;
 	unsigned long flags;
-	int err;
+	int err, ret = 0;
 
 	WARN_ON(irqs_disabled());
 
@@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 
 	common_cpu_up(cpu, tidle);
 
-	err = do_boot_cpu(apicid, cpu, tidle);
+	err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
 	if (err) {
 		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
-		return -EIO;
+		ret = -EIO;
+		goto unreg_nmi;
 	}
 
 	/*
@@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		touch_nmi_watchdog();
 	}
 
-	return 0;
+unreg_nmi:
+	/*
+	 * Clean up the nmi handler. Do this after the callin and callout sync
+	 * to avoid impact of possible long unregister time.
+	 */
+	if (cpu0_nmi_registered)
+		unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+	return ret;
 }
 
 /**
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 5f25cfbd952e..5ee663836c08 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 	unsigned long addr, seg;
 
 	addr = regs->ip;
-	seg = regs->cs & 0xffff;
+	seg = regs->cs;
 	if (v8086_mode(regs)) {
 		addr = (addr & 0xffff) + (seg << 4);
 		return addr;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 213ddf3e937d..73e4d28112f8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,6 +21,7 @@
 #include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
+#include <asm/mpx.h>
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
 	return error;
 }
 
-static void find_start_end(unsigned long flags, unsigned long *begin,
-			   unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+		unsigned long *begin, unsigned long *end)
 {
 	if (!in_compat_syscall() && (flags & MAP_32BIT)) {
 		/* This is usually used needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 	}
 
 	*begin	= get_mmap_base(1);
-	*end	= in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+	if (in_compat_syscall())
+		*end = task_size_32bit();
+	else
+		*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
 }
 
 unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_unmapped_area_info info;
 	unsigned long begin, end;
 
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	if (flags & MAP_FIXED)
 		return addr;
 
-	find_start_end(flags, &begin, &end);
+	find_start_end(addr, flags, &begin, &end);
 
 	if (len > end)
 		return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	/* requested length too big for entire address space */
 	if (len > TASK_SIZE)
 		return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = get_mmap_base(0);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 *
+	 * !in_compat_syscall() check to avoid high addresses for x32.
+	 */
+	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
 	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
 	if (filp) {
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index dcd699baea1b..a106b9719c58 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
 
 	while (n-- > 0) {
 		if (LDT_empty(info) || LDT_zero(info)) {
-			desc->a = desc->b = 0;
+			memset(desc, 0, sizeof(*desc));
 		} else {
 			fill_ldt(desc, info);
 
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 15515132bf0d..c6636d1f60b9 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -4,57 +4,38 @@
  * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
  *
  */
-#include <asm/hw_irq.h>
-#include <asm/desc.h>
+#include <linux/jump_label.h>
 #include <linux/atomic.h>
 
-atomic_t trace_idt_ctr = ATOMIC_INIT(0);
-struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
-				(unsigned long) trace_idt_table };
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
 
-static int trace_irq_vector_refcount;
-static DEFINE_MUTEX(irq_vector_mutex);
+DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
 
-static void set_trace_idt_ctr(int val)
+int trace_pagefault_reg(void)
 {
-	atomic_set(&trace_idt_ctr, val);
-	/* Ensure the trace_idt_ctr is set before sending IPI */
-	wmb();
+	static_branch_inc(&trace_pagefault_key);
+	return 0;
 }
 
-static void switch_idt(void *arg)
+void trace_pagefault_unreg(void)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	load_current_idt();
-	local_irq_restore(flags);
+	static_branch_dec(&trace_pagefault_key);
 }
 
-int trace_irq_vector_regfunc(void)
+#ifdef CONFIG_SMP
+
+DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+
+int trace_resched_ipi_reg(void)
 {
-	mutex_lock(&irq_vector_mutex);
-	if (!trace_irq_vector_refcount) {
-		set_trace_idt_ctr(1);
-		smp_call_function(switch_idt, NULL, 0);
-		switch_idt(NULL);
-	}
-	trace_irq_vector_refcount++;
-	mutex_unlock(&irq_vector_mutex);
+	static_branch_inc(&trace_resched_ipi_key);
 	return 0;
 }
 
-void trace_irq_vector_unregfunc(void)
+void trace_resched_ipi_unreg(void)
 {
-	mutex_lock(&irq_vector_mutex);
-	trace_irq_vector_refcount--;
-	if (!trace_irq_vector_refcount) {
-		set_trace_idt_ctr(0);
-		smp_call_function(switch_idt, NULL, 0);
-		switch_idt(NULL);
-	}
-	mutex_unlock(&irq_vector_mutex);
+	static_branch_dec(&trace_resched_ipi_key);
 }
+
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309b85da..34ea3651362e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -38,11 +38,6 @@
 #include <linux/smp.h>
 #include <linux/io.h>
 
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
@@ -70,20 +65,13 @@
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
 #else
 #include <asm/processor-flags.h>
 #include <asm/setup.h>
 #include <asm/proto.h>
 #endif
 
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
-
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
 
 static inline void cond_local_irq_enable(struct pt_regs *regs)
 {
@@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 }
 #endif
 
-/* Set of traps needed for early debugging. */
-void __init early_trap_init(void)
-{
-	/*
-	 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
-	 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
-	 * CPU runs at ring 0 so it is impossible to hit an invalid
-	 * stack.  Using the original stack works well enough at this
-	 * early stage. DEBUG_STACK will be equipped after cpu_init() in
-	 * trap_init().
-	 *
-	 * We don't need to set trace_idt_table like set_intr_gate(),
-	 * since we don't have trace_debug and it will be reset to
-	 * 'debug' in trap_init() by set_intr_gate_ist().
-	 */
-	set_intr_gate_notrace(X86_TRAP_DB, debug);
-	/* int3 can be called from all */
-	set_system_intr_gate(X86_TRAP_BP, &int3);
-#ifdef CONFIG_X86_32
-	set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
-	load_idt(&idt_descr);
-}
-
-void __init early_trap_pf_init(void)
-{
-#ifdef CONFIG_X86_64
-	set_intr_gate(X86_TRAP_PF, page_fault);
-#endif
-}
-
 void __init trap_init(void)
 {
-	int i;
-
-#ifdef CONFIG_EISA
-	void __iomem *p = early_ioremap(0x0FFFD9, 4);
-
-	if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
-		EISA_bus = 1;
-	early_iounmap(p, 4);
-#endif
-
-	set_intr_gate(X86_TRAP_DE, divide_error);
-	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
-	/* int4 can be called from all */
-	set_system_intr_gate(X86_TRAP_OF, &overflow);
-	set_intr_gate(X86_TRAP_BR, bounds);
-	set_intr_gate(X86_TRAP_UD, invalid_op);
-	set_intr_gate(X86_TRAP_NM, device_not_available);
-#ifdef CONFIG_X86_32
-	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
-#else
-	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
-#endif
-	set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
-	set_intr_gate(X86_TRAP_TS, invalid_TSS);
-	set_intr_gate(X86_TRAP_NP, segment_not_present);
-	set_intr_gate(X86_TRAP_SS, stack_segment);
-	set_intr_gate(X86_TRAP_GP, general_protection);
-	set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
-	set_intr_gate(X86_TRAP_MF, coprocessor_error);
-	set_intr_gate(X86_TRAP_AC, alignment_check);
-#ifdef CONFIG_X86_MCE
-	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
-#endif
-	set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
-
-	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
-
-#ifdef CONFIG_IA32_EMULATION
-	set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
-
-#ifdef CONFIG_X86_32
-	set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#endif
+	idt_setup_traps();
 
 	/*
 	 * Set the IDT descriptor to a fixed read-only location, so that the
@@ -1030,20 +940,9 @@ void __init trap_init(void)
 	 */
 	cpu_init();
 
-	/*
-	 * X86_TRAP_DB and X86_TRAP_BP have been set
-	 * in early_trap_init(). However, ITS works only after
-	 * cpu_init() loads TSS. See comments in early_trap_init().
-	 */
-	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
-	/* int3 can be called from all */
-	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+	idt_setup_ist_traps();
 
 	x86_init.irqs.trap_init();
 
-#ifdef CONFIG_X86_64
-	memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-	set_nmi_gate(X86_TRAP_DB, &debug);
-	set_nmi_gate(X86_TRAP_BP, &int3);
-#endif
+	idt_setup_debugidt_traps();
 }
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..d145a0b1f529 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -10,20 +10,22 @@
 
 #define FRAME_HEADER_SIZE (sizeof(long) * 2)
 
-/*
- * This disables KASAN checking when reading a value from another task's stack,
- * since the other task could be running on another CPU and could have poisoned
- * the stack in the meantime.
- */
-#define READ_ONCE_TASK_STACK(task, x)			\
-({							\
-	unsigned long val;				\
-	if (task == current)				\
-		val = READ_ONCE(x);			\
-	else						\
-		val = READ_ONCE_NOCHECK(x);		\
-	val;						\
-})
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return 0;
+
+	return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	return state->regs ? &state->regs->ip : state->bp + 1;
+}
 
 static void unwind_dump(struct unwind_state *state)
 {
@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
 	}
 }
 
-unsigned long unwind_get_return_address(struct unwind_state *state)
-{
-	if (unwind_done(state))
-		return 0;
-
-	return __kernel_text_address(state->ip) ? state->ip : 0;
-}
-EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
 static size_t regs_size(struct pt_regs *regs)
 {
 	/* x86_32 regs from kernel mode are two words shorter: */
@@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip)
 	if (addr >= __entry_text_start && addr < __entry_text_end)
 		return true;
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 	if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
 		return true;
-#endif
 
 	return false;
 }
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 039f36738e49..4f0e17b90463 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
 }
 EXPORT_SYMBOL_GPL(unwind_get_return_address);
 
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	return NULL;
+}
+
 bool unwind_next_frame(struct unwind_state *state)
 {
 	struct stack_info *info = &state->stack_info;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..570b70d3f604
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,582 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/sections.h>
+
+#define orc_warn(fmt, ...) \
+	printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+	return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+				    unsigned int num_entries, unsigned long ip)
+{
+	int *first = ip_table;
+	int *last = ip_table + num_entries - 1;
+	int *mid = first, *found = first;
+
+	if (!num_entries)
+		return NULL;
+
+	/*
+	 * Do a binary range search to find the rightmost duplicate of a given
+	 * starting address.  Some entries are section terminators which are
+	 * "weak" entries for ensuring there are no gaps.  They should be
+	 * ignored when they conflict with a real entry.
+	 */
+	while (first <= last) {
+		mid = first + ((last - first) / 2);
+
+		if (orc_ip(mid) <= ip) {
+			found = mid;
+			first = mid + 1;
+		} else
+			last = mid - 1;
+	}
+
+	return u_table + (found - ip_table);
+}
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	struct module *mod;
+
+	mod = __module_address(ip);
+	if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+		return NULL;
+	return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+			  mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	return NULL;
+}
+#endif
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+	if (!orc_init)
+		return NULL;
+
+	/* For non-init vmlinux addresses, use the fast lookup table: */
+	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+		unsigned int idx, start, stop;
+
+		idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+		if (unlikely((idx >= lookup_num_blocks-1))) {
+			orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
+				 idx, lookup_num_blocks, ip);
+			return NULL;
+		}
+
+		start = orc_lookup[idx];
+		stop = orc_lookup[idx + 1] + 1;
+
+		if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+			     (__start_orc_unwind + stop > __stop_orc_unwind))) {
+			orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
+				 idx, lookup_num_blocks, start, stop, ip);
+			return NULL;
+		}
+
+		return __orc_find(__start_orc_unwind_ip + start,
+				  __start_orc_unwind + start, stop - start, ip);
+	}
+
+	/* vmlinux .init slow lookup: */
+	if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+	/* Module lookup: */
+	return orc_module_find(ip);
+}
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+	struct orc_entry *orc_a, *orc_b;
+	struct orc_entry orc_tmp;
+	int *a = _a, *b = _b, tmp;
+	int delta = _b - _a;
+
+	/* Swap the .orc_unwind_ip entries: */
+	tmp = *a;
+	*a = *b + delta;
+	*b = tmp - delta;
+
+	/* Swap the corresponding .orc_unwind entries: */
+	orc_a = cur_orc_table + (a - cur_orc_ip_table);
+	orc_b = cur_orc_table + (b - cur_orc_ip_table);
+	orc_tmp = *orc_a;
+	*orc_a = *orc_b;
+	*orc_b = orc_tmp;
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+	struct orc_entry *orc_a;
+	const int *a = _a, *b = _b;
+	unsigned long a_val = orc_ip(a);
+	unsigned long b_val = orc_ip(b);
+
+	if (a_val > b_val)
+		return 1;
+	if (a_val < b_val)
+		return -1;
+
+	/*
+	 * The "weak" section terminator entries need to always be on the left
+	 * to ensure the lookup code skips them in favor of real entries.
+	 * These terminator entries exist to handle any gaps created by
+	 * whitelisted .o files which didn't get objtool generation.
+	 */
+	orc_a = cur_orc_table + (a - cur_orc_ip_table);
+	return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
+}
+
+#ifdef CONFIG_MODULES
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+			void *_orc, size_t orc_size)
+{
+	int *orc_ip = _orc_ip;
+	struct orc_entry *orc = _orc;
+	unsigned int num_entries = orc_ip_size / sizeof(int);
+
+	WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+		     orc_size % sizeof(*orc) != 0 ||
+		     num_entries != orc_size / sizeof(*orc));
+
+	/*
+	 * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+	 * associate an .orc_unwind_ip table entry with its corresponding
+	 * .orc_unwind entry so they can both be swapped.
+	 */
+	mutex_lock(&sort_mutex);
+	cur_orc_ip_table = orc_ip;
+	cur_orc_table = orc;
+	sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+	mutex_unlock(&sort_mutex);
+
+	mod->arch.orc_unwind_ip = orc_ip;
+	mod->arch.orc_unwind = orc;
+	mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+	size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+	size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+	size_t num_entries = orc_ip_size / sizeof(int);
+	struct orc_entry *orc;
+	int i;
+
+	if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+	    orc_size % sizeof(struct orc_entry) != 0 ||
+	    num_entries != orc_size / sizeof(struct orc_entry)) {
+		orc_warn("WARNING: Bad or missing .orc_unwind table.  Disabling unwinder.\n");
+		return;
+	}
+
+	/* Sort the .orc_unwind and .orc_unwind_ip tables: */
+	sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+	     orc_sort_swap);
+
+	/* Initialize the fast lookup table: */
+	lookup_num_blocks = orc_lookup_end - orc_lookup;
+	for (i = 0; i < lookup_num_blocks-1; i++) {
+		orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+				 num_entries,
+				 LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+		if (!orc) {
+			orc_warn("WARNING: Corrupt .orc_unwind table.  Disabling unwinder.\n");
+			return;
+		}
+
+		orc_lookup[i] = orc - __start_orc_unwind;
+	}
+
+	/* Initialize the ending block: */
+	orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+			 LOOKUP_STOP_IP);
+	if (!orc) {
+		orc_warn("WARNING: Corrupt .orc_unwind table.  Disabling unwinder.\n");
+		return;
+	}
+	orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+	orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return 0;
+
+	return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	if (state->regs)
+		return &state->regs->ip;
+
+	if (state->sp)
+		return (unsigned long *)state->sp - 1;
+
+	return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+			    size_t len)
+{
+	struct stack_info *info = &state->stack_info;
+
+	/*
+	 * If the address isn't on the current stack, switch to the next one.
+	 *
+	 * We may have to traverse multiple stacks to deal with the possibility
+	 * that info->next_sp could point to an empty stack and the address
+	 * could be on a subsequent stack.
+	 */
+	while (!on_stack(info, (void *)addr, len))
+		if (get_stack_info(info->next_sp, state->task, info,
+				   &state->stack_mask))
+			return false;
+
+	return true;
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+			    unsigned long *val)
+{
+	if (!stack_access_ok(state, addr, sizeof(long)))
+		return false;
+
+	*val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+	return true;
+}
+
+#define REGS_SIZE (sizeof(struct pt_regs))
+#define SP_OFFSET (offsetof(struct pt_regs, sp))
+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+			     unsigned long *ip, unsigned long *sp, bool full)
+{
+	size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+	size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+	struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		if (!stack_access_ok(state, addr, regs_size))
+			return false;
+
+		*ip = regs->ip;
+		*sp = regs->sp;
+
+		return true;
+	}
+
+	if (!stack_access_ok(state, addr, sp_offset))
+		return false;
+
+	*ip = regs->ip;
+
+	if (user_mode(regs)) {
+		if (!stack_access_ok(state, addr + sp_offset,
+				     REGS_SIZE - SP_OFFSET))
+			return false;
+
+		*sp = regs->sp;
+	} else
+		*sp = (unsigned long)&regs->sp;
+
+	return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+	unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+	enum stack_type prev_type = state->stack_info.type;
+	struct orc_entry *orc;
+	struct pt_regs *ptregs;
+	bool indirect = false;
+
+	if (unwind_done(state))
+		return false;
+
+	/* Don't let modules unload while we're reading their ORC data. */
+	preempt_disable();
+
+	/* Have we reached the end? */
+	if (state->regs && user_mode(state->regs))
+		goto done;
+
+	/*
+	 * Find the orc_entry associated with the text address.
+	 *
+	 * Decrement call return addresses by one so they work for sibling
+	 * calls and calls to noreturn functions.
+	 */
+	orc = orc_find(state->signal ? state->ip : state->ip - 1);
+	if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
+		goto done;
+	orig_ip = state->ip;
+
+	/* Find the previous frame's stack: */
+	switch (orc->sp_reg) {
+	case ORC_REG_SP:
+		sp = state->sp + orc->sp_offset;
+		break;
+
+	case ORC_REG_BP:
+		sp = state->bp + orc->sp_offset;
+		break;
+
+	case ORC_REG_SP_INDIRECT:
+		sp = state->sp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_BP_INDIRECT:
+		sp = state->bp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_R10:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R10 at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->r10;
+		break;
+
+	case ORC_REG_R13:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R13 at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->r13;
+		break;
+
+	case ORC_REG_DI:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DI at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->di;
+		break;
+
+	case ORC_REG_DX:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DX at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->dx;
+		break;
+
+	default:
+		orc_warn("unknown SP base reg %d for ip %p\n",
+			 orc->sp_reg, (void *)state->ip);
+		goto done;
+	}
+
+	if (indirect) {
+		if (!deref_stack_reg(state, sp, &sp))
+			goto done;
+	}
+
+	/* Find IP, SP and possibly regs: */
+	switch (orc->type) {
+	case ORC_TYPE_CALL:
+		ip_p = sp - sizeof(long);
+
+		if (!deref_stack_reg(state, ip_p, &state->ip))
+			goto done;
+
+		state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+						  state->ip, (void *)ip_p);
+
+		state->sp = sp;
+		state->regs = NULL;
+		state->signal = false;
+		break;
+
+	case ORC_TYPE_REGS:
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+			orc_warn("can't dereference registers at %p for ip %p\n",
+				 (void *)sp, (void *)orig_ip);
+			goto done;
+		}
+
+		state->regs = (struct pt_regs *)sp;
+		state->full_regs = true;
+		state->signal = true;
+		break;
+
+	case ORC_TYPE_REGS_IRET:
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+			orc_warn("can't dereference iret registers at %p for ip %p\n",
+				 (void *)sp, (void *)orig_ip);
+			goto done;
+		}
+
+		ptregs = container_of((void *)sp, struct pt_regs, ip);
+		if ((unsigned long)ptregs >= prev_sp &&
+		    on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+			state->regs = ptregs;
+			state->full_regs = false;
+		} else
+			state->regs = NULL;
+
+		state->signal = true;
+		break;
+
+	default:
+		orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+		break;
+	}
+
+	/* Find BP: */
+	switch (orc->bp_reg) {
+	case ORC_REG_UNDEFINED:
+		if (state->regs && state->full_regs)
+			state->bp = state->regs->bp;
+		break;
+
+	case ORC_REG_PREV_SP:
+		if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+			goto done;
+		break;
+
+	case ORC_REG_BP:
+		if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+			goto done;
+		break;
+
+	default:
+		orc_warn("unknown BP base reg %d for ip %p\n",
+			 orc->bp_reg, (void *)orig_ip);
+		goto done;
+	}
+
+	/* Prevent a recursive loop due to bad ORC data: */
+	if (state->stack_info.type == prev_type &&
+	    on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+	    state->sp <= prev_sp) {
+		orc_warn("stack going in the wrong direction? ip=%p\n",
+			 (void *)orig_ip);
+		goto done;
+	}
+
+	preempt_enable();
+	return true;
+
+done:
+	preempt_enable();
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+		    struct pt_regs *regs, unsigned long *first_frame)
+{
+	memset(state, 0, sizeof(*state));
+	state->task = task;
+
+	/*
+	 * Refuse to unwind the stack of a task while it's executing on another
+	 * CPU.  This check is racy, but that's ok: the unwinder has other
+	 * checks to prevent it from going off the rails.
+	 */
+	if (task_on_another_cpu(task))
+		goto done;
+
+	if (regs) {
+		if (user_mode(regs))
+			goto done;
+
+		state->ip = regs->ip;
+		state->sp = kernel_stack_pointer(regs);
+		state->bp = regs->bp;
+		state->regs = regs;
+		state->full_regs = true;
+		state->signal = true;
+
+	} else if (task == current) {
+		asm volatile("lea (%%rip), %0\n\t"
+			     "mov %%rsp, %1\n\t"
+			     "mov %%rbp, %2\n\t"
+			     : "=r" (state->ip), "=r" (state->sp),
+			       "=r" (state->bp));
+
+	} else {
+		struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+		state->sp = task->thread.sp;
+		state->bp = READ_ONCE_NOCHECK(frame->bp);
+		state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+	}
+
+	if (get_stack_info((unsigned long *)state->sp, state->task,
+			   &state->stack_info, &state->stack_mask))
+		return;
+
+	/*
+	 * The caller can provide the address of the first frame directly
+	 * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+	 * to start unwinding at.  Skip ahead until we reach it.
+	 */
+
+	/* When starting from regs, skip the regs frame: */
+	if (regs) {
+		unwind_next_frame(state);
+		return;
+	}
+
+	/* Otherwise, skip ahead to the user-specified starting frame: */
+	while (!unwind_done(state) &&
+	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+			state->sp <= (unsigned long)first_frame))
+		unwind_next_frame(state);
+
+	return;
+
+done:
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..f05f00acac89 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/page_types.h>
+#include <asm/orc_lookup.h>
 #include <asm/cache.h>
 #include <asm/boot.h>
 
@@ -148,6 +149,8 @@ SECTIONS
 
 	BUG_TABLE
 
+	ORC_UNWIND_TABLE
+
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 760433b2574a..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,7 +22,7 @@ config KVM
 	depends on HAVE_KVM
 	depends on HIGH_RES_TIMERS
 	# for TASKSTATS/TASK_DELAY_ACCT:
-	depends on NET
+	depends on NET && MULTIUSER
 	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select ANON_INODES
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2eea522c..19adbb418443 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
 			cpuid_mask(&entry->ecx, CPUID_7_ECX);
 			/* PKU is not yet implemented for shadow paging. */
-			if (!tdp_enabled)
+			if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
 				entry->ecx &= ~F(PKU);
 			entry->edx &= kvm_cpuid_7_0_edx_x86_features;
 			entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2695a34fa1c5..337b6d2730fa 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -649,9 +649,10 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
 				}
 
 				if ((stimer->config & HV_STIMER_ENABLE) &&
-				    stimer->count)
-					stimer_start(stimer);
-				else
+				    stimer->count) {
+					if (!stimer->msg_pending)
+						stimer_start(stimer);
+				} else
 					stimer_cleanup(stimer);
 			}
 		}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2595f9..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
 		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
 }
 
-static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
-{
-	return kvm_x86_ops->get_pkru(vcpu);
-}
-
 static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2819d4c123eb..589dcc117086 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,11 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
 static void cancel_hv_timer(struct kvm_lapic *apic)
 {
+	WARN_ON(preemptible());
 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
-	preempt_disable();
 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
 	apic->lapic_timer.hv_timer_in_use = false;
-	preempt_enable();
 }
 
 static bool start_hv_timer(struct kvm_lapic *apic)
@@ -1507,6 +1506,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
 	struct kvm_timer *ktimer = &apic->lapic_timer;
 	int r;
 
+	WARN_ON(preemptible());
 	if (!kvm_x86_ops->set_hv_timer)
 		return false;
 
@@ -1538,6 +1538,8 @@ static bool start_hv_timer(struct kvm_lapic *apic)
 static void start_sw_timer(struct kvm_lapic *apic)
 {
 	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	WARN_ON(preemptible());
 	if (apic->lapic_timer.hv_timer_in_use)
 		cancel_hv_timer(apic);
 	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
@@ -1552,15 +1554,20 @@ static void start_sw_timer(struct kvm_lapic *apic)
 
 static void restart_apic_timer(struct kvm_lapic *apic)
 {
+	preempt_disable();
 	if (!start_hv_timer(apic))
 		start_sw_timer(apic);
+	preempt_enable();
 }
 
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	preempt_disable();
+	/* If the preempt notifier has already run, it also called apic_timer_expired */
+	if (!apic->lapic_timer.hv_timer_in_use)
+		goto out;
 	WARN_ON(swait_active(&vcpu->wq));
 	cancel_hv_timer(apic);
 	apic_timer_expired(apic);
@@ -1569,6 +1576,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 		advance_periodic_target_expiration(apic);
 		restart_apic_timer(apic);
 	}
+out:
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
 
@@ -1582,9 +1591,11 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
+	preempt_disable();
 	/* Possibly the TSC deadline timer is not enabled yet */
 	if (apic->lapic_timer.hv_timer_in_use)
 		start_sw_timer(apic);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b1dd114956a..04d750813c9d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
 
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
 #define PT64_DIR_BASE_ADDR_MASK \
 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 #define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
 					    * PT32_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
-			| shadow_x_mask | shadow_nx_mask)
+			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
 
 /*
  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
  */
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-		u64 acc_track_mask)
+		u64 acc_track_mask, u64 me_mask)
 {
 	BUG_ON(!dirty_mask != !accessed_mask);
 	BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
 	shadow_acc_track_mask = acc_track_mask;
+	shadow_me_mask = me_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
 	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask;
+	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
 	if (sp_ad_disabled(sp))
 		spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		pte_access &= ~ACC_WRITE_MASK;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
+	spte |= shadow_me_mask;
 
 	if (pte_access & ACC_WRITE_MASK) {
 
@@ -4106,16 +4109,28 @@ void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+	struct rsvd_bits_validate *shadow_zero_check;
+	int i;
 
 	/*
 	 * Passing "true" to the last argument is okay; it adds a check
 	 * on bit 8 of the SPTEs which KVM doesn't use anyway.
 	 */
-	__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+	shadow_zero_check = &context->shadow_zero_check;
+	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
 				boot_cpu_data.x86_phys_bits,
 				context->shadow_root_level, uses_nx,
 				guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
 				true);
+
+	if (!shadow_me_mask)
+		return;
+
+	for (i = context->shadow_root_level; --i >= 0;) {
+		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+	}
+
 }
 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
 
@@ -4133,17 +4148,29 @@ static void
 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 				struct kvm_mmu *context)
 {
+	struct rsvd_bits_validate *shadow_zero_check;
+	int i;
+
+	shadow_zero_check = &context->shadow_zero_check;
+
 	if (boot_cpu_is_amd())
-		__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+		__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
 					boot_cpu_data.x86_phys_bits,
 					context->shadow_root_level, false,
 					boot_cpu_has(X86_FEATURE_GBPAGES),
 					true, true);
 	else
-		__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+		__reset_rsvds_bits_mask_ept(shadow_zero_check,
 					    boot_cpu_data.x86_phys_bits,
 					    false);
 
+	if (!shadow_me_mask)
+		return;
+
+	for (i = context->shadow_root_level; --i >= 0;) {
+		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+	}
 }
 
 /*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..4b9a3ae6b725 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		* index of the protection domain, so pte_pkey * 2 is
 		* is the index of the first bit for the domain.
 		*/
-		pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+		pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
 
 		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
 		offset = (pfec & ~1) +
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4d8141e533c3..8dbd8dbc83eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void)
 
 	if (vls) {
 		if (!npt_enabled ||
-		    !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) ||
+		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
 		    !IS_ENABLED(CONFIG_X86_64)) {
 			vls = false;
 		} else {
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = svm->vmcb;
 	struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
-	phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
-	phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
-	phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+	phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+	phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
 
 	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
 	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 		set_intercept(svm, INTERCEPT_MWAIT);
 	}
 
-	control->iopm_base_pa = iopm_base;
-	control->msrpm_base_pa = __pa(svm->msrpm);
+	control->iopm_base_pa = __sme_set(iopm_base);
+	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
 	control->int_ctl = V_INTR_MASKING_MASK;
 
 	init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 
 	new_entry = READ_ONCE(*entry);
-	new_entry = (page_to_phys(svm->avic_backing_page) &
-		     AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-		     AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
 	WRITE_ONCE(*entry, new_entry);
 
 	svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
-	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
 	svm->asid_generation = 0;
 	init_vmcb(svm);
 
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	__free_page(virt_to_page(svm->nested.hsave));
 	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
 }
 
-static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
 	switch (reg) {
@@ -2335,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
 	u64 pdpte;
 	int ret;
 
-	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
 				       offset_in_page(cr3) + index * 8, 8);
 	if (ret)
 		return 0;
@@ -2347,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.nested_cr3 = root;
+	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
 	svm_flush_tlb(vcpu);
 }
@@ -2430,6 +2425,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
 	svm->vmcb->control.exit_code_hi = 0;
 	svm->vmcb->control.exit_info_1 = error_code;
+
+	/*
+	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
+	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+	 * written only when inject_pending_event runs (DR6 would written here
+	 * too).  This should be conditional on a new capability---if the
+	 * capability is disabled, kvm_multiple_exception would write the
+	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+	 */
 	if (svm->vcpu.arch.exception.nested_apf)
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
 	else
@@ -2868,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 		svm->nested.msrpm[p] = svm->msrpm[p] | value;
 	}
 
-	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
 
 	return true;
 }
@@ -4501,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
 		 irq.vector);
 	*svm = to_svm(vcpu);
-	vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
 	vcpu_info->vector = irq.vector;
 
 	return 0;
@@ -4552,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 			struct amd_iommu_pi_data pi;
 
 			/* Try to enable guest_mode in IRTE */
-			pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+					    AVIC_HPA_MASK);
 			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
 						     svm->vcpu.vcpu_id);
 			pi.is_guest_mode = true;
@@ -5001,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.cr3 = root;
+	svm->vmcb->save.cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_CR);
 	svm_flush_tlb(vcpu);
 }
@@ -5010,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.nested_cr3 = root;
+	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
 
 	/* Also sync guest cr3 here in case we live migrate */
@@ -5403,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
 
-	.get_pkru = svm_get_pkru,
-
 	.tlb_flush = svm_flush_tlb,
 
 	.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 84e62acf2dd8..70b90c0810d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -198,7 +198,8 @@ struct loaded_vmcs {
 	struct vmcs *vmcs;
 	struct vmcs *shadow_vmcs;
 	int cpu;
-	int launched;
+	bool launched;
+	bool nmi_known_unmasked;
 	struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -415,13 +416,10 @@ struct nested_vmx {
 
 	/* The guest-physical address of the current VMCS L1 keeps for L2 */
 	gpa_t current_vmptr;
-	/* The host-usable pointer to the above */
-	struct page *current_vmcs12_page;
-	struct vmcs12 *current_vmcs12;
 	/*
 	 * Cache of the guest's VMCS, existing outside of guest memory.
 	 * Loaded from guest memory during VMPTRLD. Flushed to guest
-	 * memory during VMXOFF, VMCLEAR, VMPTRLD.
+	 * memory during VMCLEAR and VMPTRLD.
 	 */
 	struct vmcs12 *cached_vmcs12;
 	/*
@@ -562,7 +560,6 @@ struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
 	u8                    fail;
-	bool                  nmi_known_unmasked;
 	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	ulong                 rflags;
@@ -639,8 +636,6 @@ struct vcpu_vmx {
 
 	u64 current_tsc_ratio;
 
-	bool guest_pkru_valid;
-	u32 guest_pkru;
 	u32 host_pkru;
 
 	/*
@@ -927,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+					    u16 error_code);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2326,6 +2325,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 	__vmx_load_host_state(to_vmx(vcpu));
 }
 
+static bool emulation_required(struct kvm_vcpu *vcpu)
+{
+	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
 /*
@@ -2363,6 +2367,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	unsigned long old_rflags = vmx_get_rflags(vcpu);
+
 	__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
 	to_vmx(vcpu)->rflags = rflags;
 	if (to_vmx(vcpu)->rmode.vm86_active) {
@@ -2370,11 +2376,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 	}
 	vmcs_writel(GUEST_RFLAGS, rflags);
-}
 
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
-	return to_vmx(vcpu)->guest_pkru;
+	if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
+		to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
 }
 
 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -2418,6 +2422,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+					       unsigned long exit_qual)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	unsigned int nr = vcpu->arch.exception.nr;
+	u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+	if (vcpu->arch.exception.has_error_code) {
+		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+	}
+
+	if (kvm_exception_is_soft(nr))
+		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+	else
+		intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+	    vmx_get_nmi_mask(vcpu))
+		intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2427,23 +2455,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	unsigned int nr = vcpu->arch.exception.nr;
 
-	if (!((vmcs12->exception_bitmap & (1u << nr)) ||
-		(nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
-		return 0;
+	if (nr == PF_VECTOR) {
+		if (vcpu->arch.exception.nested_apf) {
+			nested_vmx_inject_exception_vmexit(vcpu,
+							   vcpu->arch.apf.nested_apf_token);
+			return 1;
+		}
+		/*
+		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
+		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+		 * can be written only when inject_pending_event runs.  This should be
+		 * conditional on a new capability---if the capability is disabled,
+		 * kvm_multiple_exception would write the ancillary information to
+		 * CR2 or DR6, for backwards ABI-compatibility.
+		 */
+		if (nested_vmx_is_page_fault_vmexit(vmcs12,
+						    vcpu->arch.exception.error_code)) {
+			nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+			return 1;
+		}
+	} else {
+		unsigned long exit_qual = 0;
+		if (nr == DB_VECTOR)
+			exit_qual = vcpu->arch.dr6;
 
-	if (vcpu->arch.exception.nested_apf) {
-		vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
-		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-			PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-			INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-			vcpu->arch.apf.nested_apf_token);
-		return 1;
+		if (vmcs12->exception_bitmap & (1u << nr)) {
+			nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+			return 1;
+		}
 	}
 
-	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-			  vmcs_read32(VM_EXIT_INTR_INFO),
-			  vmcs_readl(EXIT_QUALIFICATION));
-	return 1;
+	return 0;
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2657,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	 * reason is that if one of these bits is necessary, it will appear
 	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
 	 * fields of vmcs01 and vmcs02, will turn these bits off - and
-	 * nested_vmx_exit_handled() will not pass related exits to L1.
+	 * nested_vmx_exit_reflected() will not pass related exits to L1.
 	 * These rules have exceptions below.
 	 */
 
@@ -3857,11 +3900,6 @@ static __init int alloc_kvm_area(void)
 	return 0;
 }
 
-static bool emulation_required(struct kvm_vcpu *vcpu)
-{
-	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
-}
-
 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 		struct kvm_segment *save)
 {
@@ -4950,6 +4988,28 @@ static bool vmx_get_enable_apicv(void)
 	return enable_apicv;
 }
 
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	gfn_t gfn;
+
+	/*
+	 * Don't need to mark the APIC access page dirty; it is never
+	 * written to by the CPU during APIC virtualization.
+	 */
+
+	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
+
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
+}
+
+
 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4957,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	void *vapic_page;
 	u16 status;
 
-	if (vmx->nested.pi_desc &&
-	    vmx->nested.pi_pending) {
-		vmx->nested.pi_pending = false;
-		if (!pi_test_and_clear_on(vmx->nested.pi_desc))
-			return;
-
-		max_irr = find_last_bit(
-			(unsigned long *)vmx->nested.pi_desc->pir, 256);
+	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+		return;
 
-		if (max_irr == 256)
-			return;
+	vmx->nested.pi_pending = false;
+	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+		return;
 
+	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+	if (max_irr != 256) {
 		vapic_page = kmap(vmx->nested.virtual_apic_page);
 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
 		kunmap(vmx->nested.virtual_apic_page);
@@ -4980,11 +5037,16 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 			vmcs_write16(GUEST_INTR_STATUS, status);
 		}
 	}
+
+	nested_mark_vmcs12_pages_dirty(vcpu);
 }
 
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+						     bool nested)
 {
 #ifdef CONFIG_SMP
+	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
 	if (vcpu->mode == IN_GUEST_MODE) {
 		struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -5002,8 +5064,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
 		 */
 		WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
 
-		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-				POSTED_INTR_VECTOR);
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
 		return true;
 	}
 #endif
@@ -5018,7 +5079,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 	if (is_guest_mode(vcpu) &&
 	    vector == vmx->nested.posted_intr_nv) {
 		/* the PIR and ON have been set by L1. */
-		kvm_vcpu_trigger_posted_interrupt(vcpu);
+		kvm_vcpu_trigger_posted_interrupt(vcpu, true);
 		/*
 		 * If a posted intr is not recognized by hardware,
 		 * we will accomplish it in the next vmentry.
@@ -5052,7 +5113,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 	if (pi_test_and_set_on(&vmx->pi_desc))
 		return;
 
-	if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
 		kvm_vcpu_kick(vcpu);
 }
 
@@ -5510,10 +5571,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (!is_guest_mode(vcpu)) {
-		++vcpu->stat.nmi_injections;
-		vmx->nmi_known_unmasked = false;
-	}
+	++vcpu->stat.nmi_injections;
+	vmx->loaded_vmcs->nmi_known_unmasked = false;
 
 	if (vmx->rmode.vm86_active) {
 		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
@@ -5527,16 +5586,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-	if (to_vmx(vcpu)->nmi_known_unmasked)
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	bool masked;
+
+	if (vmx->loaded_vmcs->nmi_known_unmasked)
 		return false;
-	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;
+	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+	return masked;
 }
 
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx->nmi_known_unmasked = !masked;
+	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
 	if (masked)
 		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 			      GUEST_INTR_STATE_NMI);
@@ -6492,7 +6556,7 @@ void vmx_enable_tdp(void)
 		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
 		0ull, VMX_EPT_EXECUTABLE_MASK,
 		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-		VMX_EPT_RWX_MASK);
+		VMX_EPT_RWX_MASK, 0ull);
 
 	ept_set_mmio_spte_mask();
 	kvm_enable_tdp();
@@ -7124,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
 	if (vmx->nested.current_vmptr == -1ull)
 		return;
 
-	/* current_vmptr and current_vmcs12 are always set/reset together */
-	if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
-		return;
-
 	if (enable_shadow_vmcs) {
 		/* copy to memory all shadowed fields in case
 		   they were modified */
 		copy_shadow_to_vmcs12(vmx);
 		vmx->nested.sync_shadow_vmcs = false;
-		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-				SECONDARY_EXEC_SHADOW_VMCS);
-		vmcs_write64(VMCS_LINK_POINTER, -1ull);
+		vmx_disable_shadow_vmcs(vmx);
 	}
 	vmx->nested.posted_intr_nv = -1;
 
 	/* Flush VMCS12 to guest memory */
-	memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
-	       VMCS12_SIZE);
+	kvm_vcpu_write_guest_page(&vmx->vcpu,
+				  vmx->nested.current_vmptr >> PAGE_SHIFT,
+				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
-	kunmap(vmx->nested.current_vmcs12_page);
-	nested_release_page(vmx->nested.current_vmcs12_page);
 	vmx->nested.current_vmptr = -1ull;
-	vmx->nested.current_vmcs12 = NULL;
 }
 
 /*
@@ -7165,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx)
 
 	vmx->nested.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
-	nested_release_vmcs12(vmx);
+	vmx->nested.posted_intr_nv = -1;
+	vmx->nested.current_vmptr = -1ull;
 	if (vmx->nested.msr_bitmap) {
 		free_page((unsigned long)vmx->nested.msr_bitmap);
 		vmx->nested.msr_bitmap = NULL;
 	}
 	if (enable_shadow_vmcs) {
+		vmx_disable_shadow_vmcs(vmx);
 		vmcs_clear(vmx->vmcs01.shadow_vmcs);
 		free_vmcs(vmx->vmcs01.shadow_vmcs);
 		vmx->vmcs01.shadow_vmcs = NULL;
@@ -7569,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		}
 
 		nested_release_vmcs12(vmx);
-		vmx->nested.current_vmcs12 = new_vmcs12;
-		vmx->nested.current_vmcs12_page = page;
 		/*
 		 * Load VMCS12 from guest memory since it is not already
 		 * cached.
 		 */
-		memcpy(vmx->nested.cached_vmcs12,
-		       vmx->nested.current_vmcs12, VMCS12_SIZE);
+		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+		kunmap(page);
+		nested_release_page_clean(page);
+
 		set_current_vmptr(vmx, vmptr);
 	}
 
@@ -8009,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  * should handle it ourselves in L0 (and then continue L2). Only call this
  * when in is_guest_mode (L2).
  */
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 exit_reason = vmx->exit_reason;
 
 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
 				vmcs_readl(EXIT_QUALIFICATION),
@@ -8023,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
 				KVM_ISA_VMX);
 
+	/*
+	 * The host physical addresses of some pages of guest memory
+	 * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+	 * may write to these pages via their host physical address while
+	 * L2 is running, bypassing any address-translation-based dirty
+	 * tracking (e.g. EPT write protection).
+	 *
+	 * Mark them dirty on every exit from L2 to prevent them from
+	 * getting out of sync with dirty tracking.
+	 */
+	nested_mark_vmcs12_pages_dirty(vcpu);
+
 	if (vmx->nested.nested_run_pending)
 		return false;
 
@@ -8159,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/*
+	 * At this point, the exit interruption info in exit_intr_info
+	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
+	 * we need to query the in-kernel LAPIC.
+	 */
+	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+	if ((exit_intr_info &
+	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->vm_exit_intr_error_code =
+			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	}
+
+	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+			  vmcs_readl(EXIT_QUALIFICATION));
+	return 1;
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
 	*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8405,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required)
 		return handle_invalid_guest_state(vcpu);
 
-	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu, exit_reason,
-				  vmcs_read32(VM_EXIT_INTR_INFO),
-				  vmcs_readl(EXIT_QUALIFICATION));
-		return 1;
-	}
+	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		dump_vmcs();
@@ -8685,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 
 		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
 		desc = (gate_desc *)vmx->host_idt_base + vector;
-		entry = gate_offset(*desc);
+		entry = gate_offset(desc);
 		asm volatile(
 #ifdef CONFIG_X86_64
 			"mov %%" _ASM_SP ", %[sp]\n\t"
@@ -8736,7 +8830,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 
 	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-	if (vmx->nmi_known_unmasked)
+	if (vmx->loaded_vmcs->nmi_known_unmasked)
 		return;
 	/*
 	 * Can't use vmx->exit_intr_info since we're not sure what
@@ -8760,7 +8854,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 			      GUEST_INTR_STATE_NMI);
 	else
-		vmx->nmi_known_unmasked =
+		vmx->loaded_vmcs->nmi_known_unmasked =
 			!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
 			  & GUEST_INTR_STATE_NMI);
 }
@@ -8919,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
-	if (vmx->guest_pkru_valid)
-		__write_pkru(vmx->guest_pkru);
+	if (static_cpu_has(X86_FEATURE_PKU) &&
+	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+	    vcpu->arch.pkru != vmx->host_pkru)
+		__write_pkru(vcpu->arch.pkru);
 
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
@@ -9068,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * back on host, so it is safe to read guest PKRU from current
 	 * XSAVE.
 	 */
-	if (boot_cpu_has(X86_FEATURE_OSPKE)) {
-		vmx->guest_pkru = __read_pkru();
-		if (vmx->guest_pkru != vmx->host_pkru) {
-			vmx->guest_pkru_valid = true;
+	if (static_cpu_has(X86_FEATURE_PKU) &&
+	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+		vcpu->arch.pkru = __read_pkru();
+		if (vcpu->arch.pkru != vmx->host_pkru)
 			__write_pkru(vmx->host_pkru);
-		} else
-			vmx->guest_pkru_valid = false;
 	}
 
 	/*
@@ -9213,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
-	vmx->nested.current_vmcs12 = NULL;
 
 	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
@@ -9499,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	WARN_ON(!is_guest_mode(vcpu));
 
-	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
-		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
-				  vmcs_read32(VM_EXIT_INTR_INFO),
-				  vmcs_readl(EXIT_QUALIFICATION));
-	else
+	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+		vmcs12->vm_exit_intr_error_code = fault->error_code;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+				  fault->address);
+	} else {
 		kvm_inject_page_fault(vcpu, fault);
+	}
 }
 
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -10032,6 +10128,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			     vmcs12->vm_entry_instruction_len);
 		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 			     vmcs12->guest_interruptibility_info);
+		vmx->loaded_vmcs->nmi_known_unmasked =
+			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
 	} else {
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 	}
@@ -10056,13 +10154,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
 	/* Posted interrupts setting is only taken from vmcs12.  */
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		/*
-		 * Note that we use L0's vector here and in
-		 * vmx_deliver_nested_posted_interrupt.
-		 */
 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
 		vmx->nested.pi_pending = false;
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 	} else {
 		exec_control &= ~PIN_BASED_POSTED_INTR;
 	}
@@ -10086,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
-	 *
-	 * A problem with this approach (when !enable_ept) is that L1 may be
-	 * injected with more page faults than it asked for. This could have
-	 * caused problems, but in practice existing hypervisors don't care.
-	 * To fix this, we will need to emulate the PFEC checking (on the L1
-	 * page tables), using walk_addr(), when injecting PFs to L1.
 	 */
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
 		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10488,6 +10576,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 {
 	struct vmcs12 *vmcs12;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
 	u32 exit_qual;
 	int ret;
 
@@ -10512,6 +10601,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * for misconfigurations which will anyway be caught by the processor
 	 * when using the merged vmcs02.
 	 */
+	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
+		nested_vmx_failValid(vcpu,
+				     VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+		goto out;
+	}
+
 	if (vmcs12->launch_state == launch) {
 		nested_vmx_failValid(vcpu,
 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
@@ -10832,13 +10927,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
 	vmcs12->vm_exit_reason = exit_reason;
 	vmcs12->exit_qualification = exit_qualification;
-
 	vmcs12->vm_exit_intr_info = exit_intr_info;
-	if ((vmcs12->vm_exit_intr_info &
-	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
-		vmcs12->vm_exit_intr_error_code =
-			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
 	vmcs12->idt_vectoring_info_field = 0;
 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -10926,7 +11016,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		 */
 		vmx_flush_tlb(vcpu);
 	}
-
+	/* Restore posted intr vector. */
+	if (nested_cpu_has_posted_intr(vmcs12))
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11032,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
-	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-	    && nested_exit_intr_ack_set(vcpu)) {
+	/*
+	 * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+	 * the VM-exit interrupt information (valid interrupt) is always set to
+	 * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+	 * kvm_cpu_has_interrupt().  See the commit message for details.
+	 */
+	if (nested_exit_intr_ack_set(vcpu) &&
+	    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+	    kvm_cpu_has_interrupt(vcpu)) {
 		int irq = kvm_cpu_get_interrupt(vcpu);
 		WARN_ON(irq < 0);
 		vmcs12->vm_exit_intr_info = irq |
@@ -11576,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
 
-	.get_pkru = vmx_get_pkru,
-
 	.tlb_flush = vmx_flush_tlb,
 
 	.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b8f07889f6a..ef5102f80497 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
 
@@ -597,8 +598,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
-	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
+	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
+	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
 	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
@@ -3159,15 +3160,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		kvm_set_hflags(vcpu, hflags);
 
 		vcpu->arch.smi_pending = events->smi.pending;
-		if (events->smi.smm_inside_nmi)
-			vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
-		else
-			vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-		if (lapic_in_kernel(vcpu)) {
-			if (events->smi.latched_init)
-				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+
+		if (events->smi.smm) {
+			if (events->smi.smm_inside_nmi)
+				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
 			else
-				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+			if (lapic_in_kernel(vcpu)) {
+				if (events->smi.latched_init)
+					set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+				else
+					clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+			}
 		}
 	}
 
@@ -3242,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 			u32 size, offset, ecx, edx;
 			cpuid_count(XSTATE_CPUID, index,
 				    &size, &offset, &ecx, &edx);
-			memcpy(dest + offset, src, size);
+			if (feature == XFEATURE_MASK_PKRU)
+				memcpy(dest + offset, &vcpu->arch.pkru,
+				       sizeof(vcpu->arch.pkru));
+			else
+				memcpy(dest + offset, src, size);
+
 		}
 
 		valid -= feature;
@@ -3280,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 			u32 size, offset, ecx, edx;
 			cpuid_count(XSTATE_CPUID, index,
 				    &size, &offset, &ecx, &edx);
-			memcpy(dest, src + offset, size);
+			if (feature == XFEATURE_MASK_PKRU)
+				memcpy(&vcpu->arch.pkru, src + offset,
+				       sizeof(vcpu->arch.pkru));
+			else
+				memcpy(dest, src + offset, size);
 		}
 
 		valid -= feature;
@@ -6113,7 +6126,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK, 0);
+			PT_PRESENT_MASK, 0, sme_me_mask);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6215,6 +6228,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 
 	lapic_irq.shorthand = 0;
 	lapic_irq.dest_mode = 0;
+	lapic_irq.level = 0;
 	lapic_irq.dest_id = apicid;
 	lapic_irq.msi_redir_hint = false;
 
@@ -6721,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
 
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
-					   unsigned long address)
-{
-	/*
-	 * The physical address of apic access page is stored in the VMCS.
-	 * Update it when it becomes invalid.
-	 */
-	if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
-		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
 /*
  * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -7629,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->guest_fpu_loaded = 1;
 	__kernel_fpu_begin();
-	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+	/* PKRU is separately restored in kvm_x86_ops->run.  */
+	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+				~XFEATURE_MASK_PKRU);
 	trace_kvm_fpu(1);
 }
 
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
-	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT && PCI
-	select TTY
-	select VIRTUALIZATION
-	select VIRTIO
-	select VIRTIO_CONSOLE
-	help
-	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
-	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 10k.  If in doubt, say N.
-
-	  If you say Y here, make sure you say Y (or M) to the virtio block
-	  and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y		:= head_32.o boot.o
-CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways.  First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes.  We call the first kernel the Host,
- * and the others the Guests.  The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time.  This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest?  We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h>		/* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
-	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-	.noirq_iret = (u32)lguest_noirq_iret,
-	.kernel_address = PAGE_OFFSET,
-	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly.  This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3,
-			unsigned long arg4)
-{
-	/* Note: This code assumes we're uniprocessor. */
-	static unsigned int next_call;
-	unsigned long flags;
-
-	/*
-	 * Disable interrupts if not already disabled: we don't want an
-	 * interrupt handler making a hypercall while we're already doing
-	 * one!
-	 */
-	local_irq_save(flags);
-	if (lguest_data.hcall_status[next_call] != 0xFF) {
-		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3, arg4);
-	} else {
-		lguest_data.hcalls[next_call].arg0 = call;
-		lguest_data.hcalls[next_call].arg1 = arg1;
-		lguest_data.hcalls[next_call].arg2 = arg2;
-		lguest_data.hcalls[next_call].arg3 = arg3;
-		lguest_data.hcalls[next_call].arg4 = arg4;
-		/* Arguments must all be written before we mark it to go */
-		wmb();
-		lguest_data.hcall_status[next_call] = 0;
-		if (++next_call == LHCALL_RING_SIZE)
-			next_call = 0;
-	}
-	local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall().  This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
- * are reasonably expensive, batching them up makes sense.  For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, 0, 0, 0);
-	else
-		async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, 0, 0);
-	else
-		async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, 0);
-	else
-		async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3,
-			unsigned long arg4)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, arg4);
-	else
-		async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction.  The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags").  The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
-	return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
-	lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment.  Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules.  In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register.  To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch.  One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled.  There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag.  If an interrupt does come
- * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in.  Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares?  The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
-				   int entrynum, const gate_desc *g)
-{
-	/*
-	 * The gate_desc structure is 8 bytes long: we hand it to the Host in
-	 * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
-	 * around like this; typesafety wasn't a big concern in Linux's early
-	 * years.
-	 */
-	u32 *desc = (u32 *)g;
-	/* Keep the local copy up to date. */
-	native_write_idt_entry(dt, entrynum, g);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *idt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT).  You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table.  There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *gdt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
-				   const void *desc, int type)
-{
-	native_write_gdt_entry(dt, entrynum, desc, type);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
-	      dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables).  As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	/*
-	 * There's one problem which normal hardware doesn't have: the Host
-	 * can't handle us removing entries we're currently using.  So we clear
-	 * the GS register here: if it's needed it'll be reloaded anyway.
-	 */
-	lazy_load_gs(0);
-	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
- * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment.  Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 6 languages.  I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction.  Then I pretty much turned
- * off feature bits until the Guest booted.  (Don't say that: you'll damage
- * lguest sales!)  Shut up, inner voice!  (Hey, just pointing out that this is
- * hardly future proof.)  No one's listening!  They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
-			 unsigned int *cx, unsigned int *dx)
-{
-	int function = *ax;
-
-	native_cpuid(ax, bx, cx, dx);
-	switch (function) {
-	/*
-	 * CPUID 0 gives the highest legal CPUID number (and the ID string).
-	 * We futureproof our code a little by sticking to known CPUID values.
-	 */
-	case 0:
-		if (*ax > 5)
-			*ax = 5;
-		break;
-
-	/*
-	 * CPUID 1 is a basic feature request.
-	 *
-	 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
-	 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
-	 */
-	case 1:
-		*cx &= 0x00002201;
-		*dx &= 0x07808151;
-		/*
-		 * The Host can do a nice optimization if it knows that the
-		 * kernel mappings (addresses above 0xC0000000 or whatever
-		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
-		 * flush_tlb_user() for both user and kernel mappings unless
-		 * the Page Global Enable (PGE) feature bit is set.
-		 */
-		*dx |= 0x00002000;
-		/*
-		 * We also lie, and say we're family id 5.  6 or greater
-		 * leads to a rdmsr in early_init_intel which we can't handle.
-		 * Family ID is returned as bits 8-12 in ax.
-		 */
-		*ax &= 0xFFFFF0FF;
-		*ax |= 0x00000500;
-		break;
-
-	/*
-	 * This is used to detect if we're running under KVM.  We might be,
-	 * but that's a Host matter, not us.  So say we're not.
-	 */
-	case KVM_CPUID_SIGNATURE:
-		*bx = *cx = *dx = 0;
-		break;
-
-	/*
-	 * 0x80000000 returns the highest Extended Function, so we futureproof
-	 * like we do above by limiting it to known fields.
-	 */
-	case 0x80000000:
-		if (*ax > 0x80000008)
-			*ax = 0x80000008;
-		break;
-
-	/*
-	 * PAE systems can mark pages as non-executable.  Linux calls this the
-	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection).  We just switch it off here, since we don't
-	 * support it.
-	 */
-	case 0x80000001:
-		*dx &= ~(1 << 20);
-		break;
-	}
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it.  The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used.  Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
-	return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
-	return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
-	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
-	current_cr3 = cr3;
-
-	/* These two page tables are simple, linear, and used during boot */
-	if (cr3 != __pa_symbol(swapper_pg_dir) &&
-	    cr3 != __pa_symbol(initial_page_table))
-		cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
-	return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
-	return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant.  The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
- * maps virtual addresses to physical addresses using "page tables".  We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables.   But since most virtual addresses
- * are unused, we use a two level index which saves space.  The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages.  Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- *	    |  	   --------->+---------+
- *	    |	      |	     | PADDR1  |
- *	  Mid-level   |	     | PADDR2  |
- *	  (PMD) page  |	     | 	       |
- *	    |	      |	   Lower-level |
- *	    |	      |	   (PTE) page  |
- *	    |	      |	     |	       |
- *	      ....    	     	 ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page.  If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper  ]
- *	   [   Level  ]
- *	   [  Entries ]
- *	   [(PUD Page)]---> +---------+
- *	 		    |  	   --------->+---------+
- *	 		    |	      |	     | PADDR1  |
- *	 		  Mid-level   |	     | PADDR2  |
- *	 		  (PMD) page  |	     | 	       |
- *	 		    |	      |	   Lower-level |
- *	 		    |	      |	   (PTE) page  |
- *	 		    |	      |	     |	       |
- *	 		      ....    	     	 ....
- *
- *
- * And the virtual address is decoded as:
- *
- *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into    Index into mid    Index into lower    Offset within page
- * top entries   directory page     pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages.  The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  We tell the Host the toplevel and
- * address this corresponds to.  The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
-			       pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-	/* PAE needs to hand a 64 bit page table entry, so it uses two args. */
-	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
-		    ptep->pte_low, ptep->pte_high);
-#else
-	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
-	native_set_pud(pudp, pudval);
-
-	/* 32 bytes aligned pdpt address and the index. */
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
-		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more.  This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them.  Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds!  So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry.  These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	native_set_pte_atomic(ptep, pte);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
-			     pte_t *ptep)
-{
-	native_pte_clear(mm, addr, ptep);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
-	lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations.  On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present).  Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
-	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed.  That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit.  We don't actually "ack" interrupts as such, we
- * just mask and unmask them.  I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
-	set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
-	clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
-	.name		= "lguest",
-	.irq_mask	= disable_lguest_irq,
-	.irq_mask_ack	= disable_lguest_irq,
-	.irq_unmask	= enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
-	struct irq_desc *desc;
-	int err;
-
-	/* Returns -ve error or vector number. */
-	err = irq_alloc_desc_at(irq, 0);
-	if (err < 0 && err != -EEXIST)
-		return err;
-
-	/*
-	 * Tell the Linux infrastructure that the interrupt is
-	 * controlled by our level-based lguest interrupt controller.
-	 */
-	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-				      handle_level_irq, "level");
-
-	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-	desc = irq_to_desc(irq);
-	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
-	return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
-	int err;
-	u8 line = 0;
-
-	/* We literally use the PCI interrupt line as the irq number. */
-	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-	err = lguest_setup_irq(line);
-	if (!err)
-		dev->irq = line;
-	return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
-	WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
-	unsigned int i;
-
-	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, irq_entries_start +
-					8 * (i - FIRST_EXTERNAL_VECTOR));
-	}
-
-	/*
-	 * This call is required to set up for 4k stacks, where we have
-	 * separate stacks for hard and soft interrupts.
-	 */
-	irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
-	*now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
-	return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
-	unsigned long sec, nsec;
-
-	/*
-	 * Since the time is in two parts (seconds and nanoseconds), we risk
-	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
-	 * and getting 99 and 0.  As Linux tends to come apart under the stress
-	 * of time travel, we must be careful:
-	 */
-	do {
-		/* First we read the seconds part. */
-		sec = lguest_data.time.tv_sec;
-		/*
-		 * This read memory barrier tells the compiler and the CPU that
-		 * this can't be reordered: we have to complete the above
-		 * before going on.
-		 */
-		rmb();
-		/* Now we read the nanoseconds part. */
-		nsec = lguest_data.time.tv_nsec;
-		/* Make sure we've done that. */
-		rmb();
-		/* Now if the seconds part has changed, try again. */
-	} while (unlikely(lguest_data.time.tv_sec != sec));
-
-	/* Our lguest clock is in real nanoseconds. */
-	return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
-	.name		= "lguest",
-	.rating		= 200,
-	.read		= lguest_clock_read,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
-                                           struct clock_event_device *evt)
-{
-	/* FIXME: I don't think this can ever happen, but James tells me he had
-	 * to put this code in.  Maybe we should remove it now.  Anyone? */
-	if (delta < LG_CLOCK_MIN_DELTA) {
-		if (printk_ratelimit())
-			printk(KERN_DEBUG "%s: small delta %lu ns\n",
-			       __func__, delta);
-		return -ETIME;
-	}
-
-	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
-	return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
-	/* A 0 argument shuts the clock down. */
-	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-	return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
-	.name                   = "lguest",
-	.features               = CLOCK_EVT_FEAT_ONESHOT,
-	.set_next_event         = lguest_clockevent_set_next_event,
-	.set_state_shutdown	= lguest_clockevent_shutdown,
-	.rating                 = INT_MAX,
-	.mult                   = 1,
-	.shift                  = 0,
-	.min_delta_ns           = LG_CLOCK_MIN_DELTA,
-	.min_delta_ticks        = LG_CLOCK_MIN_DELTA,
-	.max_delta_ns           = LG_CLOCK_MAX_DELTA,
-	.max_delta_ticks        = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
-	unsigned long flags;
-
-	/* Don't interrupt us while this is running. */
-	local_irq_save(flags);
-	lguest_clockevent.event_handler(&lguest_clockevent);
-	local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure.  The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
-	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	if (lguest_setup_irq(0) != 0)
-		panic("Could not set up timer irq");
-	irq_set_handler(0, lguest_time_irq);
-
-	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
-	/* We can't set cpumask in the initializer: damn C limitations!  Set it
-	 * here and register our timer device. */
-	lguest_clockevent.cpumask = cpumask_of(0);
-	clockevents_register_device(&lguest_clockevent);
-
-	/* Finally, we unblock the timer interrupt. */
-	clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work.  They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use.  For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
-{
-	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
-		   THREAD_SIZE / PAGE_SIZE);
-	tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
-	/* FIXME: Implement */
-	return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
-	/* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices).  This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that.  Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0.  So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads.  So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
-	return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
-	return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
-	return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
-	apic->read = lguest_apic_read;
-	apic->write = lguest_apic_write;
-	apic->icr_read = lguest_apic_icr_read;
-	apic->icr_write = lguest_apic_icr_write;
-	apic->wait_icr_idle = lguest_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP!  Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
-	hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"),
-	      LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't.  But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-	/* The hcall won't return, but to keep gcc happy, we're "done". */
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
-	.notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
-	/*
-	 * The Linux bootloader header contains an "e820" memory map: the
-	 * Launcher populated the first entry with our memory limit.
-	 */
-	e820__range_add(boot_params.e820_table[0].addr,
-			  boot_params.e820_table[0].size,
-			  boot_params.e820_table[0].type);
-
-	/* This string is for the boot messages. */
-	return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
-	write_pci_config_byte(0, 1, 0,
-			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
-			      0);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, length),
-			 4);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
-			 off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
-	/*
-	 * We could set this up once, then leave it; nothing else in the *
-	 * kernel should touch these registers.  But if it went wrong, that
-	 * would be a horrible bug to find.
-	 */
-	set_cfg_window(cfg_offset, off);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
-	u8 cap, common_cap = 0, device_cap = 0;
-	u32 device_len;
-
-	/* Avoid recursive printk into here. */
-	console_cfg_offset = -1;
-
-	if (!early_pci_allowed()) {
-		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
-		return;
-	}
-
-	/* We expect a console PCI device at BUS0, slot 1. */
-	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
-		printk(KERN_ERR "lguest: PCI device is %#x!\n",
-		       read_pci_config(0, 1, 0, 0));
-		return;
-	}
-
-	/* Find the capabilities we need (must be in bar0) */
-	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
-	while (cap) {
-		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
-		if (vndr == PCI_CAP_ID_VNDR) {
-			u8 type, bar;
-
-			type = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, cfg_type));
-			bar = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, bar));
-
-			switch (type) {
-			case VIRTIO_PCI_CAP_DEVICE_CFG:
-				if (bar == 0)
-					device_cap = cap;
-				break;
-			case VIRTIO_PCI_CAP_PCI_CFG:
-				console_access_cap = cap;
-				break;
-			}
-		}
-		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
-	}
-	if (!device_cap || !console_access_cap) {
-		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
-		       common_cap, device_cap, console_access_cap);
-		return;
-	}
-
-	/*
-	 * Note that we can't check features, until we've set the DRIVER
-	 * status bit.  We don't want to do that until we have a real driver,
-	 * so we just check that the device-specific config has room for
-	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
-	 * it should ignore the access.
-	 */
-	device_len = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, length));
-	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
-			  + sizeof(u32))) {
-		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
-		return;
-	}
-
-	console_cfg_offset = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, offset));
-	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
-	/* If we couldn't find PCI console, forget it. */
-	if (console_cfg_offset < 0)
-		return count;
-
-	if (unlikely(!console_cfg_offset)) {
-		probe_pci_console();
-		if (console_cfg_offset < 0)
-			return count;
-	}
-
-	write_bar_via_cfg(console_access_cap,
-			  console_cfg_offset
-			  + offsetof(struct virtio_console_config, emerg_wr),
-			  buf[0]);
-	return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel.  This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"?  The rest of that quote is
- * "... But that usually will create another problem."  This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient.  We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
-	const char *start, *end;
-} lguest_insns[] = {
-	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
-			     unsigned long addr, unsigned len)
-{
-	unsigned int insn_len;
-
-	/* Don't do anything special if we don't have a replacement */
-	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
-	/* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-	if (len < insn_len)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	/* Copy in our instructions. */
-	memcpy(ibuf, lguest_insns[type].start, insn_len);
-	return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest.  The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
-	/* We're under lguest. */
-	pv_info.name = "lguest";
-	/* We're running at privilege level 1, not 0 as normal. */
-	pv_info.kernel_rpl = 1;
-	/* Everyone except Xen runs with this set. */
-	pv_info.shared_kernel_pmd = 1;
-
-	/*
-	 * We set up all the lguest overrides for sensitive operations.  These
-	 * are detailed with the operations themselves.
-	 */
-
-	/* Interrupt-related operations */
-	pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
-	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
-	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
-	pv_irq_ops.safe_halt = lguest_safe_halt;
-
-	/* Setup operations */
-	pv_init_ops.patch = lguest_patch;
-
-	/* Intercepts of various CPU instructions */
-	pv_cpu_ops.load_gdt = lguest_load_gdt;
-	pv_cpu_ops.cpuid = lguest_cpuid;
-	pv_cpu_ops.load_idt = lguest_load_idt;
-	pv_cpu_ops.iret = lguest_iret;
-	pv_cpu_ops.load_sp0 = lguest_load_sp0;
-	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
-	pv_cpu_ops.set_ldt = lguest_set_ldt;
-	pv_cpu_ops.load_tls = lguest_load_tls;
-	pv_cpu_ops.get_debugreg = lguest_get_debugreg;
-	pv_cpu_ops.set_debugreg = lguest_set_debugreg;
-	pv_cpu_ops.read_cr0 = lguest_read_cr0;
-	pv_cpu_ops.write_cr0 = lguest_write_cr0;
-	pv_cpu_ops.read_cr4 = lguest_read_cr4;
-	pv_cpu_ops.write_cr4 = lguest_write_cr4;
-	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
-	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
-	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
-	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
-	/* Pagetable management */
-	pv_mmu_ops.write_cr3 = lguest_write_cr3;
-	pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
-	pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
-	pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-	pv_mmu_ops.set_pte = lguest_set_pte;
-	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
-	pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
-	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
-	pv_mmu_ops.pte_clear = lguest_pte_clear;
-	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
-	pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
-	pv_mmu_ops.read_cr2 = lguest_read_cr2;
-	pv_mmu_ops.read_cr3 = lguest_read_cr3;
-	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
-	pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
-	pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/* APIC read/write intercepts */
-	set_lguest_basic_apic_ops();
-#endif
-
-	x86_init.resources.memory_setup = lguest_memory_setup;
-	x86_init.irqs.intr_init = lguest_init_IRQ;
-	x86_init.timers.timer_init = lguest_time_init;
-	x86_platform.calibrate_tsc = lguest_tsc_khz;
-	x86_platform.get_wallclock =  lguest_get_wallclock;
-
-	/*
-	 * Now is a good time to look at the implementations of these functions
-	 * before returning to the rest of lguest_init().
-	 */
-
-	/*G:070
-	 * Now we've seen all the paravirt_ops, we return to
-	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 */
-
-	/*
-	 * The stack protector is a weird thing where gcc places a canary
-	 * value on the stack and then checks it on return.  This file is
-	 * compiled with -fno-stack-protector it, so we got this far without
-	 * problems.  The value of the canary is kept at offset 20 from the
-	 * %gs register, so we need to set that up before calling C functions
-	 * in other files.
-	 */
-	setup_stack_canary_segment(0);
-
-	/*
-	 * We could just call load_stack_canary_segment(), but we might as well
-	 * call switch_to_new_gdt() which loads the whole table and sets up the
-	 * per-cpu segment descriptor register %fs as well.
-	 */
-	switch_to_new_gdt(0);
-
-	/*
-	 * The Host<->Guest Switcher lives at the top of our address space, and
-	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
-	 * it put the answer in lguest_data.reserve_mem
-	 */
-	reserve_top_address(lguest_data.reserve_mem);
-
-	/* Hook in our special panic hypercall code. */
-	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
-	/*
-	 * This is messy CPU setup stuff which the native boot code does before
-	 * start_kernel, so we have to do, too:
-	 */
-	cpu_detect(&new_cpu_data);
-	/* head.S usually sets up the first capability word, so do it here. */
-	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
-	/* Math is always hard! */
-	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
-	/* We don't have features.  We have puppies!  Puppies! */
-#ifdef CONFIG_X86_MCE
-	mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
-	acpi_disabled = 1;
-#endif
-
-	/*
-	 * We set the preferred console to "hvc".  This is the "hypervisor
-	 * virtual console" driver written by the PowerPC people, which we also
-	 * adapted for lguest's use.
-	 */
-	add_preferred_console("hvc", 0, NULL);
-
-	/* Register our very early console. */
-	virtio_cons_early_init(early_put_chars);
-
-	/* Don't let ACPI try to control our PCI interrupts. */
-	disable_acpi();
-
-	/* We control them ourselves, by overriding these two hooks. */
-	pcibios_enable_irq = lguest_enable_irq;
-	pcibios_disable_irq = lguest_disable_irq;
-
-	/*
-	 * Last of all, we set the power management poweroff hook to point to
-	 * the Guest routine to power off, and the reboot hook to our restart
-	 * routine.
-	 */
-	pm_power_off = lguest_power_off;
-	machine_ops.restart = lguest_restart;
-
-	/*
-	 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
-	 * to boot as normal.  It never returns.
-	 */
-	i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables.  Finally it checks the 'hardware_subarch' field.  This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here!  We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
-	/*
-	 * We make the "initialization" hypercall now to tell the Host where
-	 * our lguest_data struct is.
-	 */
-	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
-	movl $LHCALL_NEW_PGTABLE, %eax
-	movl $(initial_page_table - __PAGE_OFFSET), %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Set up the initial stack so we can run C code. */
-	movl $(init_thread_union+THREAD_SIZE),%esp
-
-	/* Jumps are relative: we're running __PAGE_OFFSET too low. */
-	jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers.  These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...)			\
-	lgstart_##name:	insns; lgend_##name:;		\
-	.globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later).  If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
-	/*
-	 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
-	 * X86_EFLAGS_IF (ie. "Interrupts enabled").
-	 */
-	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * But now we need to check if the Host wants to know: there might have
-	 * been interrupts waiting to be delivered, in which case it will have
-	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-	 * jump to send_interrupts, otherwise we're done.
-	 */
-	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
-	jnz send_interrupts
-	/*
-	 * One cool thing about x86 is that you can do many things without using
-	 * a register.  In this case, the normal path hasn't needed to save or
-	 * restore any registers at all!
-	 */
-	ret
-send_interrupts:
-	/*
-	 * OK, now we need a register: eax is used for the hypercall number,
-	 * which is LHCALL_SEND_INTERRUPTS.
-	 *
-	 * We used not to bother with this pending detection at all, which was
-	 * much simpler.  Sooner or later the Host would realize it had to
-	 * send us an interrupt.  But that turns out to make performance 7
-	 * times worse on a simple tcp benchmark.  So now we do this the hard
-	 * way.
-	 */
-	pushl %eax
-	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/* This is the actual hypercall trap. */
-	int  $LGUEST_TRAP_ENTRY
-	/* Put eax back the way we found it. */
-	popl %eax
-	ret
-
-/*
- * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
-	/* This is just "lguest_data.irq_enabled = flags;" */
-	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * Now, if the %eax value has enabled interrupts and
-	 * lguest_data.irq_pending is set, we want to tell the Host so it can
-	 * deliver any outstanding interrupts.  Fortunately, both values will
-	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
-	 * instruction will AND them together for us.  If both are set, we
-	 * jump to send_interrupts.
-	 */
-	testl lguest_data+LGUEST_DATA_irq_pending, %eax
-	jnz send_interrupts
-	/* Again, the normal path has used no extra registers.  Clever, huh? */
-	ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it.  However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway.  If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap.  The
- * stack looks like this:
- *   old address
- *   old code segment & privilege level
- *   old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once.  The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic.  The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
-	pushl	2*4(%esp)
-	/*
-	 * Note the %ss: segment prefix here.  Normal data accesses use the
-	 * "ds" segment, but that will have already been restored for whatever
-	 * we're returning to (such as userspace): we can't trust it.  The %ss:
-	 * prefix makes sure we use the stack segment, which is still valid.
-	 */
-	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
-	iret
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
 	return 0;	/* Buffer overrun */
 }
 
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+		      const char *option, char *buffer, int bufsize)
+{
+	char c;
+	int pos = 0, len = -1;
+	const char *opptr = NULL;
+	char *bufptr = buffer;
+	enum {
+		st_wordstart = 0,	/* Start of word/after whitespace */
+		st_wordcmp,	/* Comparing this word */
+		st_wordskip,	/* Miscompare, skip */
+		st_bufcpy,	/* Copying this to buffer */
+	} state = st_wordstart;
+
+	if (!cmdline)
+		return -1;      /* No command line */
+
+	/*
+	 * This 'pos' check ensures we do not overrun
+	 * a non-NULL-terminated 'cmdline'
+	 */
+	while (pos++ < max_cmdline_size) {
+		c = *(char *)cmdline++;
+		if (!c)
+			break;
+
+		switch (state) {
+		case st_wordstart:
+			if (myisspace(c))
+				break;
+
+			state = st_wordcmp;
+			opptr = option;
+			/* fall through */
+
+		case st_wordcmp:
+			if ((c == '=') && !*opptr) {
+				/*
+				 * We matched all the way to the end of the
+				 * option we were looking for, prepare to
+				 * copy the argument.
+				 */
+				len = 0;
+				bufptr = buffer;
+				state = st_bufcpy;
+				break;
+			} else if (c == *opptr++) {
+				/*
+				 * We are currently matching, so continue
+				 * to the next character on the cmdline.
+				 */
+				break;
+			}
+			state = st_wordskip;
+			/* fall through */
+
+		case st_wordskip:
+			if (myisspace(c))
+				state = st_wordstart;
+			break;
+
+		case st_bufcpy:
+			if (myisspace(c)) {
+				state = st_wordstart;
+			} else {
+				/*
+				 * Increment len, but don't overrun the
+				 * supplied buffer and leave room for the
+				 * NULL terminator.
+				 */
+				if (++len < bufsize)
+					*bufptr++ = c;
+			}
+			break;
+		}
+	}
+
+	if (bufsize)
+		*bufptr = '\0';
+
+	return len;
+}
+
 int cmdline_find_option_bool(const char *cmdline, const char *option)
 {
 	return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
 }
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+			int bufsize)
+{
+	return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+				     buffer, bufsize);
+}
diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile
index 9b0c63b60302..1b2dac174321 100644
--- a/arch/x86/math-emu/Makefile
+++ b/arch/x86/math-emu/Makefile
@@ -5,8 +5,8 @@
 #DEBUG	= -DDEBUGGING
 DEBUG	=
 PARANOID = -DPARANOID
-EXTRA_CFLAGS	:= $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION)
-EXTRA_AFLAGS	:= $(PARANOID)
+ccflags-y += $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION)
+asflags-y += $(PARANOID)
 
 # From 'C' language sources:
 C_OBJS =fpu_entry.o errors.o \
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index f77ba3058b31..066996dba6a2 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -363,3 +363,4 @@ L_bugged_2:
 	pop	%ebx
 	jmp	L_exit
 #endif /* PARANOID */ 
+ENDPROC(div_Xsig)
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index 47099628fa4c..2c71527bd917 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,4 +44,4 @@ ENTRY(FPU_div_small)
 
 	leave
 	ret
-
+ENDPROC(FPU_div_small)
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index afbc4d805d66..c9c320dccca1 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -157,7 +157,7 @@ extern u_char const data_sizes_16[32];
 
 #define signbyte(a) (((u_char *)(a))[9])
 #define getsign(a) (signbyte(a) & 0x80)
-#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; }
+#define setsign(a,b) { if ((b) != 0) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; }
 #define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \
                         else signbyte(b) &= 0x7f; }
 #define changesign(a) { signbyte(a) ^= 0x80; }
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 0203baefb5c0..d4a7df2205b8 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info)
 		}
 
 		code_descriptor = FPU_get_ldt_descriptor(FPU_CS);
-		if (SEG_D_SIZE(code_descriptor)) {
+		if (code_descriptor.d) {
 			/* The above test may be wrong, the book is not clear */
 			/* Segmented 32 bit protected mode */
 			addr_modes.default_mode = SEG32;
@@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info)
 			/* 16 bit protected mode */
 			addr_modes.default_mode = PM16;
 		}
-		FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
-		code_limit = code_base
-		    + (SEG_LIMIT(code_descriptor) +
-		       1) * SEG_GRANULARITY(code_descriptor)
-		    - 1;
+		FPU_EIP += code_base = seg_get_base(&code_descriptor);
+		code_limit = seg_get_limit(&code_descriptor) + 1;
+		code_limit *= seg_get_granularity(&code_descriptor);
+		code_limit += code_base - 1;
 		if (code_limit < code_base)
 			code_limit = 0xffffffff;
 	}
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index a179254a5122..699f329f1d40 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
 	return ret;
 }
 
-#define SEG_D_SIZE(x)		((x).b & (3 << 21))
-#define SEG_G_BIT(x)		((x).b & (1 << 23))
-#define SEG_GRANULARITY(x)	(((x).b & (1 << 23)) ? 4096 : 1)
-#define SEG_286_MODE(x)		((x).b & ( 0xff000000 | 0xf0000 | (1 << 23)))
-#define SEG_BASE_ADDR(s)	(((s).b & 0xff000000) \
-				 | (((s).b & 0xff) << 16) | ((s).a >> 16))
-#define SEG_LIMIT(s)		(((s).b & 0xff0000) | ((s).a & 0xffff))
-#define SEG_EXECUTE_ONLY(s)	(((s).b & ((1 << 11) | (1 << 9))) == (1 << 11))
-#define SEG_WRITE_PERM(s)	(((s).b & ((1 << 11) | (1 << 9))) == (1 << 9))
-#define SEG_EXPAND_DOWN(s)	(((s).b & ((1 << 11) | (1 << 10))) \
-				 == (1 << 10))
+#define SEG_TYPE_WRITABLE	(1U << 1)
+#define SEG_TYPE_EXPANDS_DOWN	(1U << 2)
+#define SEG_TYPE_EXECUTE	(1U << 3)
+#define SEG_TYPE_EXPAND_MASK	(SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE)
+#define SEG_TYPE_EXECUTE_MASK	(SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE)
+
+static inline unsigned long seg_get_base(struct desc_struct *d)
+{
+	unsigned long base = (unsigned long)d->base2 << 24;
+
+	return base | ((unsigned long)d->base1 << 16) | d->base0;
+}
+
+static inline unsigned long seg_get_limit(struct desc_struct *d)
+{
+	return ((unsigned long)d->limit1 << 16) | d->limit0;
+}
+
+static inline unsigned long seg_get_granularity(struct desc_struct *d)
+{
+	return d->g ? 4096 : 1;
+}
+
+static inline bool seg_expands_down(struct desc_struct *d)
+{
+	return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN;
+}
+
+static inline bool seg_execute_only(struct desc_struct *d)
+{
+	return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE;
+}
+
+static inline bool seg_writable(struct desc_struct *d)
+{
+	return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
+}
 
 #define I387			(&current->thread.fpu.state)
 #define FPU_info		(I387->soft.info)
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index b8ef9f9d2ffc..c48967c6a0e2 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment,
 	}
 
 	descriptor = FPU_get_ldt_descriptor(addr->selector);
-	base_address = SEG_BASE_ADDR(descriptor);
+	base_address = seg_get_base(&descriptor);
 	address = base_address + offset;
-	limit = base_address
-	    + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1;
+	limit = seg_get_limit(&descriptor) + 1;
+	limit *= seg_get_granularity(&descriptor);
+	limit += base_address - 1;
 	if (limit < base_address)
 		limit = 0xffffffff;
 
-	if (SEG_EXPAND_DOWN(descriptor)) {
-		if (SEG_G_BIT(descriptor))
+	if (seg_expands_down(&descriptor)) {
+		if (descriptor.g) {
 			seg_top = 0xffffffff;
-		else {
+		} else {
 			seg_top = base_address + (1 << 20);
 			if (seg_top < base_address)
 				seg_top = 0xffffffff;
@@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment,
 		    (address > limit) || (address < base_address) ? 0 :
 		    ((limit - address) >= 254 ? 255 : limit - address + 1);
 	}
-	if (SEG_EXECUTE_ONLY(descriptor) ||
-	    (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
+	if (seg_execute_only(&descriptor) ||
+	    (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
 		access_limit = 0;
 	}
 	return address;
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 717785a53eb4..22e0631bb85a 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,6 +62,7 @@ ENTRY(mul32_Xsig)
 	popl %esi
 	leave
 	ret
+ENDPROC(mul32_Xsig)
 
 
 ENTRY(mul64_Xsig)
@@ -114,6 +115,7 @@ ENTRY(mul64_Xsig)
 	popl %esi
 	leave
 	ret
+ENDPROC(mul64_Xsig)
 
 
 
@@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig)
 	popl %esi
 	leave
 	ret
-
+ENDPROC(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 17315c89ff3d..a9aaf414135d 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,3 +133,4 @@ L_accum_done:
 	popl	%esi
 	leave
 	ret
+ENDPROC(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index b77360fdbf4a..19b33b50adfa 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -168,7 +168,7 @@ static int compare(FPU_REG const *b, int tagb)
 /* This function requires that st(0) is not empty */
 int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
 {
-	int f = 0, c;
+	int f, c;
 
 	c = compare(loaded_data, loaded_tag);
 
@@ -189,12 +189,12 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
 		case COMP_No_Comp:
 			f = SW_C3 | SW_C2 | SW_C0;
 			break;
-#ifdef PARANOID
 		default:
+#ifdef PARANOID
 			EXCEPTION(EX_INTERNAL | 0x121);
+#endif /* PARANOID */
 			f = SW_C3 | SW_C2 | SW_C0;
 			break;
-#endif /* PARANOID */
 		}
 	setcc(f);
 	if (c & COMP_Denormal) {
@@ -205,7 +205,7 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
 
 static int compare_st_st(int nr)
 {
-	int f = 0, c;
+	int f, c;
 	FPU_REG *st_ptr;
 
 	if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
@@ -235,12 +235,12 @@ static int compare_st_st(int nr)
 		case COMP_No_Comp:
 			f = SW_C3 | SW_C2 | SW_C0;
 			break;
-#ifdef PARANOID
 		default:
+#ifdef PARANOID
 			EXCEPTION(EX_INTERNAL | 0x122);
+#endif /* PARANOID */
 			f = SW_C3 | SW_C2 | SW_C0;
 			break;
-#endif /* PARANOID */
 		}
 	setcc(f);
 	if (c & COMP_Denormal) {
@@ -283,12 +283,12 @@ static int compare_i_st_st(int nr)
 	case COMP_No_Comp:
 		f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
 		break;
-#ifdef PARANOID
 	default:
+#ifdef PARANOID
 		EXCEPTION(EX_INTERNAL | 0x122);
+#endif /* PARANOID */
 		f = 0;
 		break;
-#endif /* PARANOID */
 	}
 	FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
 	if (c & COMP_Denormal) {
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index 8b6352efceef..53ac1a343c69 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -94,6 +94,7 @@ L_overflow:
 	call	arith_overflow
 	pop	%ebx
 	jmp	L_exit
+ENDPROC(FPU_normalize)
 
 
 
@@ -145,3 +146,4 @@ L_exit_nuo_zero:
 	popl	%ebx
 	leave
 	ret
+ENDPROC(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index d1d4e48b4f67..41af5b208d88 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -706,3 +706,5 @@ L_exception_exit:
 	mov	$-1,%eax
 	jmp	fpu_reg_round_special_exit
 #endif /* PARANOID */ 
+
+ENDPROC(FPU_round)
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 47c4c2434d85..3b1bc5e9b2f6 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -165,3 +165,4 @@ L_exit:
 	leave
 	ret
 #endif /* PARANOID */
+ENDPROC(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index cc00654b6f9a..796eb5ab921b 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -469,3 +469,5 @@ L_exit:
 	leave
 	ret
 #endif /* PARANOID */ 
+
+ENDPROC(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 973f12af97df..6196f68cf3c1 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -146,3 +146,4 @@ L_exit:
 	ret
 #endif /* PARANOID */ 
 
+ENDPROC(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index 1b6c24801d22..d115b900919a 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,3 +270,4 @@ L_exit:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index bbe0e87718e4..87c99749a495 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
 	popl	%ebx
 	leave
 	ret
-
+ENDPROC(round_Xsig)
 
 
 
@@ -138,4 +138,4 @@ L_n_exit:
 	popl	%ebx
 	leave
 	ret
-
+ENDPROC(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 31cdd118e918..c8552edeec75 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -85,3 +85,4 @@ L_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 518428317985..340dd6897f85 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -92,6 +92,7 @@ L_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_shrx)
 
 
 /*---------------------------------------------------------------------------+
@@ -202,3 +203,4 @@ Ls_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_shrxs)
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
index d258f59564e1..695afae38fdf 100644
--- a/arch/x86/math-emu/wm_sqrt.S
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -468,3 +468,4 @@ sqrt_more_prec_large:
 /* Our estimate is too large */
 	movl	$0x7fffff00,%eax
 	jmp	sqrt_round_result
+ENDPROC(wm_sqrt)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..72bf8c01c6e3 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 
+obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..5e3ac6fe6c9e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -13,12 +13,12 @@
  */
 
 #include <linux/debugfs.h>
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 
-#include <asm/kasan.h>
 #include <asm/pgtable.h>
 
 /*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 {
 	pgprotval_t pr = pgprot_val(prot);
 	static const char * const level_name[] =
-		{ "cr3", "pgd", "pud", "pmd", "pte" };
+		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
 	if (!pgprot_val(prot)) {
 		/* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 			pt_dump_cont_printf(m, dmsg, "    ");
 
 		/* Bit 7 has a different meaning on level 3 vs 4 */
-		if (level <= 3 && pr & _PAGE_PSE)
+		if (level <= 4 && pr & _PAGE_PSE)
 			pt_dump_cont_printf(m, dmsg, "PSE ");
 		else
 			pt_dump_cont_printf(m, dmsg, "    ");
-		if ((level == 4 && pr & _PAGE_PAT) ||
-		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+		if ((level == 5 && pr & _PAGE_PAT) ||
+		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
 			pt_dump_cont_printf(m, dmsg, "PAT ");
 		else
 			pt_dump_cont_printf(m, dmsg, "    ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
  */
 static unsigned long normalize_addr(unsigned long u)
 {
-#ifdef CONFIG_X86_64
-	return (signed long)(u << 16) >> 16;
-#else
-	return u;
-#endif
+	int shift;
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return u;
+
+	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+	return (signed long)(u << shift) >> shift;
 }
 
 /*
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 	for (i = 0; i < PTRS_PER_PTE; i++) {
 		prot = pte_flags(*start);
 		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
-		note_page(m, st, __pgprot(prot), 4);
+		note_page(m, st, __pgprot(prot), 5);
 		start++;
 	}
 }
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+				void *pt)
+{
+	if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+	    __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+	    __pa(pt) == __pa(kasan_zero_pud)) {
+		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+		note_page(m, st, __pgprot(prot), 5);
+		return true;
+	}
+	return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+				void *pt)
+{
+	return false;
+}
+#endif
 
 #if PTRS_PER_PMD > 1
 
 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
 	int i;
-	pmd_t *start;
+	pmd_t *start, *pmd_start;
 	pgprotval_t prot;
 
-	start = (pmd_t *)pud_page_vaddr(addr);
+	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
 			if (pmd_large(*start) || !pmd_present(*start)) {
 				prot = pmd_flags(*start);
-				note_page(m, st, __pgprot(prot), 3);
-			} else {
+				note_page(m, st, __pgprot(prot), 4);
+			} else if (!kasan_page_table(m, st, pmd_start)) {
 				walk_pte_level(m, st, *start,
 					       P + i * PMD_LEVEL_MULT);
 			}
 		} else
-			note_page(m, st, __pgprot(0), 3);
+			note_page(m, st, __pgprot(0), 4);
 		start++;
 	}
 }
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 
 #if PTRS_PER_PUD > 1
 
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
-	return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
 	int i;
-	pud_t *start;
+	pud_t *start, *pud_start;
 	pgprotval_t prot;
 	pud_t *prev_pud = NULL;
 
-	start = (pud_t *)p4d_page_vaddr(addr);
+	pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
-		if (!pud_none(*start) &&
-		    !pud_already_checked(prev_pud, start, st->check_wx)) {
+		if (!pud_none(*start)) {
 			if (pud_large(*start) || !pud_present(*start)) {
 				prot = pud_flags(*start);
-				note_page(m, st, __pgprot(prot), 2);
-			} else {
+				note_page(m, st, __pgprot(prot), 3);
+			} else if (!kasan_page_table(m, st, pud_start)) {
 				walk_pmd_level(m, st, *start,
 					       P + i * PUD_LEVEL_MULT);
 			}
 		} else
-			note_page(m, st, __pgprot(0), 2);
+			note_page(m, st, __pgprot(0), 3);
 
 		prev_pud = start;
 		start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
 {
 	int i;
-	p4d_t *start;
+	p4d_t *start, *p4d_start;
 	pgprotval_t prot;
 
-	start = (p4d_t *)pgd_page_vaddr(addr);
+	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
 
 	for (i = 0; i < PTRS_PER_P4D; i++) {
 		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 			if (p4d_large(*start) || !p4d_present(*start)) {
 				prot = p4d_flags(*start);
 				note_page(m, st, __pgprot(prot), 2);
-			} else {
+			} else if (!kasan_page_table(m, st, p4d_start)) {
 				walk_pud_level(m, st, *start,
 					       P + i * P4D_LEVEL_MULT);
 			}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..c076f710de4c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL_GPL(ex_handler_fault);
 
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+bool ex_handler_refcount(const struct exception_table_entry *fixup,
+			 struct pt_regs *regs, int trapnr)
+{
+	/* First unconditionally saturate the refcount. */
+	*(int *)regs->cx = INT_MIN / 2;
+
+	/*
+	 * Strictly speaking, this reports the fixup destination, not
+	 * the fault location, and not the actually overflowing
+	 * instruction, which is the instruction before the "js", but
+	 * since that instruction could be a variety of lengths, just
+	 * report the location after the overflow, which should be close
+	 * enough for finding the overflow, as it's at least back in
+	 * the function, having returned from .text.unlikely.
+	 */
+	regs->ip = ex_fixup_addr(fixup);
+
+	/*
+	 * This function has been called because either a negative refcount
+	 * value was seen by any of the refcount functions, or a zero
+	 * refcount value was seen by refcount_dec().
+	 *
+	 * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+	 * wrapped around) will be set. Additionally, seeing the refcount
+	 * reach 0 will set ZF (Zero Flag: result was zero). In each of
+	 * these cases we want a report, since it's a boundary condition.
+	 *
+	 */
+	if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+		bool zero = regs->flags & X86_EFLAGS_ZF;
+
+		refcount_error_report(regs, zero ? "hit zero" : "overflow");
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_refcount);
+
 bool ex_handler_ext(const struct exception_table_entry *fixup,
 		   struct pt_regs *regs, int trapnr)
 {
@@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
 	 * undefined.  I'm not sure which CPUs do this, but at least
 	 * the 486 DX works this way.
 	 */
-	if ((regs->cs & 0xFFFF) != __KERNEL_CS)
+	if (regs->cs != __KERNEL_CS)
 		goto fail;
 
 	/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..b836a7274e12 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address)
 	pte_t *pte;
 
 #ifdef CONFIG_X86_PAE
-	printk("*pdpt = %016Lx ", pgd_val(*pgd));
+	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
 	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
 		goto out;
+#define pr_pde pr_cont
+#else
+#define pr_pde pr_info
 #endif
 	p4d = p4d_offset(pgd, address);
 	pud = pud_offset(p4d, address);
 	pmd = pmd_offset(pud, address);
-	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+#undef pr_pde
 
 	/*
 	 * We must not directly access the pte in the highpte
@@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address)
 		goto out;
 
 	pte = pte_offset_kernel(pmd, address);
-	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
 out:
-	printk("\n");
+	pr_cont("\n");
 }
 
 #else /* CONFIG_X86_64: */
@@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address)
 	if (bad_address(pgd))
 		goto bad;
 
-	printk("PGD %lx ", pgd_val(*pgd));
+	pr_info("PGD %lx ", pgd_val(*pgd));
 
 	if (!pgd_present(*pgd))
 		goto out;
@@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address)
 	if (bad_address(p4d))
 		goto bad;
 
-	printk("P4D %lx ", p4d_val(*p4d));
+	pr_cont("P4D %lx ", p4d_val(*p4d));
 	if (!p4d_present(*p4d) || p4d_large(*p4d))
 		goto out;
 
@@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address)
 	if (bad_address(pud))
 		goto bad;
 
-	printk("PUD %lx ", pud_val(*pud));
+	pr_cont("PUD %lx ", pud_val(*pud));
 	if (!pud_present(*pud) || pud_large(*pud))
 		goto out;
 
@@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address)
 	if (bad_address(pmd))
 		goto bad;
 
-	printk("PMD %lx ", pmd_val(*pmd));
+	pr_cont("PMD %lx ", pmd_val(*pmd));
 	if (!pmd_present(*pmd) || pmd_large(*pmd))
 		goto out;
 
@@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address)
 	if (bad_address(pte))
 		goto bad;
 
-	printk("PTE %lx", pte_val(*pte));
+	pr_cont("PTE %lx", pte_val(*pte));
 out:
-	printk("\n");
+	pr_cont("\n");
 	return;
 bad:
-	printk("BAD\n");
+	pr_info("BAD\n");
 }
 
 #endif /* CONFIG_X86_64 */
@@ -1254,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
- *
- * This function must have noinline because both callers
- * {,trace_}do_page_fault() have notrace on. Having this an actual function
- * guarantees there's a function trace entry.
  */
 static noinline void
 __do_page_fault(struct pt_regs *regs, unsigned long error_code,
@@ -1490,27 +1490,6 @@ good_area:
 }
 NOKPROBE_SYMBOL(__do_page_fault);
 
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
-{
-	unsigned long address = read_cr2(); /* Get the faulting address */
-	enum ctx_state prev_state;
-
-	/*
-	 * We must have this function tagged with __kprobes, notrace and call
-	 * read_cr2() before calling anything else. To avoid calling any kind
-	 * of tracing machinery before we've observed the CR2 value.
-	 *
-	 * exception_{enter,exit}() contain all sorts of tracepoints.
-	 */
-
-	prev_state = exception_enter();
-	__do_page_fault(regs, error_code, address);
-	exception_exit(prev_state);
-}
-NOKPROBE_SYMBOL(do_page_fault);
-
-#ifdef CONFIG_TRACING
 static nokprobe_inline void
 trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
 			 unsigned long error_code)
@@ -1521,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
+/*
+ * We must have this function blacklisted from kprobes, tagged with notrace
+ * and call read_cr2() before calling anything else. To avoid calling any
+ * kind of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contains all sorts of tracepoints.
+ */
 dotraplinkage void notrace
-trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
-	/*
-	 * The exception_enter and tracepoint processing could
-	 * trigger another page faults (user space callchain
-	 * reading) and destroy the original cr2 value, so read
-	 * the faulting address now.
-	 */
-	unsigned long address = read_cr2();
+	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	trace_page_fault_entries(address, regs, error_code);
+	if (trace_pagefault_enabled())
+		trace_page_fault_entries(address, regs, error_code);
+
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }
-NOKPROBE_SYMBOL(trace_do_page_fault);
-#endif /* CONFIG_TRACING */
+NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 2824607df108..6d06cf33e3de 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -18,6 +18,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/elf.h>
+#include <asm/mpx.h>
 
 #if 0	/* This is just for testing */
 struct page *
@@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = get_mmap_base(1);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 */
 	info.high_limit = in_compat_syscall() ?
-		tasksize_32bit() : tasksize_64bit();
+		task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-		unsigned long addr0, unsigned long len,
+		unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
 {
 	struct hstate *h = hstate_file(file);
 	struct vm_unmapped_area_info info;
-	unsigned long addr;
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = get_mmap_base(0);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 */
+	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
@@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = TASK_SIZE_LOW;
 		addr = vm_unmapped_area(&info);
 	}
 
@@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
+
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index adab1595f4bd..31cea988fa36 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 		if (!pmd)
 			return -ENOMEM;
 		ident_pmd_init(info, pmd, addr, next);
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
 	}
 
 	return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
 		if (!pud)
 			return -ENOMEM;
 		ident_pud_init(info, pud, addr, next);
-		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+		set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
 	}
 
 	return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 	unsigned long next;
 	int result;
 
+	/* Set the default pagetable flags if not supplied */
+	if (!info->kernpg_flag)
+		info->kernpg_flag = _KERNPG_TABLE;
+
 	for (; addr < end; addr = next) {
 		pgd_t *pgd = pgd_page + pgd_index(addr);
 		p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 		if (result)
 			return result;
 		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
 		} else {
 			/*
 			 * With p4d folded, pgd is equal to p4d.
 			 * The pgd entry has to point to the pud page table in this case.
 			 */
 			pud_t *pud = pud_offset(p4d, 0);
-			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
 		}
 	}
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 673541eb3b3f..7777ccc0e9f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,7 @@
 #include <asm/dma.h>		/* for MAX_DMA_PFN */
 #include <asm/microcode.h>
 #include <asm/kaslr.h>
+#include <asm/hypervisor.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -636,6 +637,8 @@ void __init init_mem_mapping(void)
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
 
+	hypervisor_init_mem_mapping();
+
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
@@ -812,7 +815,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
-	.state = 0,
+	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4c1b5fd0c7ad..34f0e1847dd6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,8 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -21,6 +23,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
+#include <asm/setup.h>
 
 #include "physaddr.h"
 
@@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	}
 
 	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (is_ISA_range(phys_addr, last_addr))
-		return (__force void __iomem *)phys_to_virt(phys_addr);
-
-	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
 	pfn      = phys_addr >> PAGE_SHIFT;
@@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr)
 		return;
 
 	/*
-	 * __ioremap special-cases the PCI/ISA range by not instantiating a
-	 * vm_area and by simply returning an address into the kernel mapping
-	 * of ISA space.   So handle that here.
+	 * The PCI/ISA range special-casing was removed from __ioremap()
+	 * so this check, in theory, can be removed. However, there are
+	 * cases where iounmap() is called for addresses not obtained via
+	 * ioremap() (vga16fb for example). Add a warning so that these
+	 * cases can be caught and fixed.
 	 */
 	if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
-	    (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+	    (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+		WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
 		return;
+	}
 
 	addr = (volatile void __iomem *)
 		(PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 	unsigned long offset = phys & ~PAGE_MASK;
 	void *vaddr;
 
-	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
-	if (page_is_ram(start >> PAGE_SHIFT))
-		return __va(phys);
+	/* memremap() maps if RAM, otherwise falls back to ioremap() */
+	vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
 
-	vaddr = ioremap_cache(start, PAGE_SIZE);
-	/* Only add the offset on success and return NULL if the ioremap() failed: */
+	/* Only add the offset on success and return NULL if memremap() failed */
 	if (vaddr)
 		vaddr += offset;
 
@@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 {
-	if (page_is_ram(phys >> PAGE_SHIFT))
-		return;
+	memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted.  If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+					  unsigned long size)
+{
+	int is_pmem;
+
+	/*
+	 * Check if the address is part of a persistent memory region.
+	 * This check covers areas added by E820, EFI and ACPI.
+	 */
+	is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+				    IORES_DESC_PERSISTENT_MEMORY);
+	if (is_pmem != REGION_DISJOINT)
+		return true;
+
+	/*
+	 * Check if the non-volatile attribute is set for an EFI
+	 * reserved area.
+	 */
+	if (efi_enabled(EFI_BOOT)) {
+		switch (efi_mem_type(phys_addr)) {
+		case EFI_RESERVED_TYPE:
+			if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+				return true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Check if the address is outside kernel usable area */
+	switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+	case E820_TYPE_RESERVED:
+	case E820_TYPE_ACPI:
+	case E820_TYPE_NVS:
+	case E820_TYPE_UNUSABLE:
+	case E820_TYPE_PRAM:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+				 unsigned long size)
+{
+	u64 paddr;
+
+	/* Check if the address is part of EFI boot/runtime data */
+	if (!efi_enabled(EFI_BOOT))
+		return false;
+
+	paddr = boot_params.efi_info.efi_memmap_hi;
+	paddr <<= 32;
+	paddr |= boot_params.efi_info.efi_memmap;
+	if (phys_addr == paddr)
+		return true;
+
+	paddr = boot_params.efi_info.efi_systab_hi;
+	paddr <<= 32;
+	paddr |= boot_params.efi_info.efi_systab;
+	if (phys_addr == paddr)
+		return true;
+
+	if (efi_is_table_address(phys_addr))
+		return true;
+
+	switch (efi_mem_type(phys_addr)) {
+	case EFI_BOOT_SERVICES_DATA:
+	case EFI_RUNTIME_SERVICES_DATA:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+				   unsigned long size)
+{
+	struct setup_data *data;
+	u64 paddr, paddr_next;
+
+	paddr = boot_params.hdr.setup_data;
+	while (paddr) {
+		unsigned int len;
+
+		if (phys_addr == paddr)
+			return true;
+
+		data = memremap(paddr, sizeof(*data),
+				MEMREMAP_WB | MEMREMAP_DEC);
+
+		paddr_next = data->next;
+		len = data->len;
+
+		memunmap(data);
+
+		if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+			return true;
+
+		paddr = paddr_next;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+						unsigned long size)
+{
+	struct setup_data *data;
+	u64 paddr, paddr_next;
+
+	paddr = boot_params.hdr.setup_data;
+	while (paddr) {
+		unsigned int len;
+
+		if (phys_addr == paddr)
+			return true;
+
+		data = early_memremap_decrypted(paddr, sizeof(*data));
+
+		paddr_next = data->next;
+		len = data->len;
+
+		early_memunmap(data, sizeof(*data));
+
+		if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+			return true;
+
+		paddr = paddr_next;
+	}
+
+	return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+				 unsigned long flags)
+{
+	if (!sme_active())
+		return true;
+
+	if (flags & MEMREMAP_ENC)
+		return true;
+
+	if (flags & MEMREMAP_DEC)
+		return false;
+
+	if (memremap_is_setup_data(phys_addr, size) ||
+	    memremap_is_efi_data(phys_addr, size) ||
+	    memremap_should_map_decrypted(phys_addr, size))
+		return false;
+
+	return true;
+}
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+					     unsigned long size,
+					     pgprot_t prot)
+{
+	if (!sme_active())
+		return prot;
+
+	if (early_memremap_is_setup_data(phys_addr, size) ||
+	    memremap_is_efi_data(phys_addr, size) ||
+	    memremap_should_map_decrypted(phys_addr, size))
+		prot = pgprot_decrypted(prot);
+	else
+		prot = pgprot_encrypted(prot);
+
+	return prot;
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+	return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+				      unsigned long size)
+{
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+					 unsigned long size)
+{
+	/* Be sure the write-protect PAT entry is set for write-protect */
+	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+		return NULL;
+
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+				      unsigned long size)
+{
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+					 unsigned long size)
+{
+	/* Be sure the write-protect PAT entry is set for write-protect */
+	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+		return NULL;
 
-	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
 }
+#endif	/* CONFIG_ARCH_USE_MEMREMAP_PROT */
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..bc84b73684b7 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -11,8 +11,8 @@
 #include <asm/e820/types.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/pgtable.h>
 
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static int __init map_range(struct range *range)
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
 void __init kasan_early_init(void)
 {
 	int i;
-	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
 	pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
 	pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
 	p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
 	 */
 	memset(kasan_zero_page, 0, PAGE_SIZE);
 	for (i = 0; i < PTRS_PER_PTE; i++) {
-		pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+		pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
 		set_pte(&kasan_zero_pte[i], pte);
 	}
 	/* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000000..0fbd09269757
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,593 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[]  __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+unsigned long sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL_GPL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+				       unsigned long size, bool enc)
+{
+	void *src, *dst;
+	size_t len;
+
+	if (!sme_me_mask)
+		return;
+
+	local_flush_tlb();
+	wbinvd();
+
+	/*
+	 * There are limited number of early mapping slots, so map (at most)
+	 * one page at time.
+	 */
+	while (size) {
+		len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+		/*
+		 * Create mappings for the current and desired format of
+		 * the memory. Use a write-protected mapping for the source.
+		 */
+		src = enc ? early_memremap_decrypted_wp(paddr, len) :
+			    early_memremap_encrypted_wp(paddr, len);
+
+		dst = enc ? early_memremap_encrypted(paddr, len) :
+			    early_memremap_decrypted(paddr, len);
+
+		/*
+		 * If a mapping can't be obtained to perform the operation,
+		 * then eventual access of that area in the desired mode
+		 * will cause a crash.
+		 */
+		BUG_ON(!src || !dst);
+
+		/*
+		 * Use a temporary buffer, of cache-line multiple size, to
+		 * avoid data corruption as documented in the APM.
+		 */
+		memcpy(sme_early_buffer, src, len);
+		memcpy(dst, sme_early_buffer, len);
+
+		early_memunmap(dst, len);
+		early_memunmap(src, len);
+
+		paddr += len;
+		size -= len;
+	}
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+	__sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+	__sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+					     bool map)
+{
+	unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+	pmdval_t pmd_flags, pmd;
+
+	/* Use early_pmd_flags but remove the encryption mask */
+	pmd_flags = __sme_clr(early_pmd_flags);
+
+	do {
+		pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+		__early_make_pgtable((unsigned long)vaddr, pmd);
+
+		vaddr += PMD_SIZE;
+		paddr += PMD_SIZE;
+		size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+	} while (size);
+
+	__native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+	struct boot_params *boot_data;
+	unsigned long cmdline_paddr;
+
+	if (!sme_active())
+		return;
+
+	/* Get the command line address before unmapping the real_mode_data */
+	boot_data = (struct boot_params *)real_mode_data;
+	cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+	__sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+	if (!cmdline_paddr)
+		return;
+
+	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+	struct boot_params *boot_data;
+	unsigned long cmdline_paddr;
+
+	if (!sme_active())
+		return;
+
+	__sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+	/* Get the command line address after mapping the real_mode_data */
+	boot_data = (struct boot_params *)real_mode_data;
+	cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+	if (!cmdline_paddr)
+		return;
+
+	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+	unsigned int i;
+
+	if (!sme_me_mask)
+		return;
+
+	early_pmd_flags = __sme_set(early_pmd_flags);
+
+	__supported_pte_mask = __sme_set(__supported_pte_mask);
+
+	/* Update the protection map with memory encryption mask */
+	for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+		protection_map[i] = pgprot_encrypted(protection_map[i]);
+}
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+	if (!sme_me_mask)
+		return;
+
+	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+	swiotlb_update_mem_attributes();
+
+	pr_info("AMD Secure Memory Encryption (SME) active\n");
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+	WARN(PAGE_ALIGN(size) != size,
+	     "size is not page-aligned (%#lx)\n", size);
+
+	/* Make the SWIOTLB buffer area decrypted */
+	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+				 unsigned long end)
+{
+	unsigned long pgd_start, pgd_end, pgd_size;
+	pgd_t *pgd_p;
+
+	pgd_start = start & PGDIR_MASK;
+	pgd_end = end & PGDIR_MASK;
+
+	pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+	pgd_size *= sizeof(pgd_t);
+
+	pgd_p = pgd_base + pgd_index(start);
+
+	memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS	_KERNPG_TABLE_NOENC
+#define P4D_FLAGS	_KERNPG_TABLE_NOENC
+#define PUD_FLAGS	_KERNPG_TABLE_NOENC
+#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+				     unsigned long vaddr, pmdval_t pmd_val)
+{
+	pgd_t *pgd_p;
+	p4d_t *p4d_p;
+	pud_t *pud_p;
+	pmd_t *pmd_p;
+
+	pgd_p = pgd_base + pgd_index(vaddr);
+	if (native_pgd_val(*pgd_p)) {
+		if (IS_ENABLED(CONFIG_X86_5LEVEL))
+			p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+		else
+			pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+	} else {
+		pgd_t pgd;
+
+		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+			p4d_p = pgtable_area;
+			memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+			pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+			pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+		} else {
+			pud_p = pgtable_area;
+			memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+			pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+			pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+		}
+		native_set_pgd(pgd_p, pgd);
+	}
+
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_p += p4d_index(vaddr);
+		if (native_p4d_val(*p4d_p)) {
+			pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+		} else {
+			p4d_t p4d;
+
+			pud_p = pgtable_area;
+			memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+			pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+			p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+			native_set_p4d(p4d_p, p4d);
+		}
+	}
+
+	pud_p += pud_index(vaddr);
+	if (native_pud_val(*pud_p)) {
+		if (native_pud_val(*pud_p) & _PAGE_PSE)
+			goto out;
+
+		pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+	} else {
+		pud_t pud;
+
+		pmd_p = pgtable_area;
+		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+		pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+		native_set_pud(pud_p, pud);
+	}
+
+	pmd_p += pmd_index(vaddr);
+	if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+		native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+	return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+	unsigned long p4d_size, pud_size, pmd_size;
+	unsigned long total;
+
+	/*
+	 * Perform a relatively simplistic calculation of the pagetable
+	 * entries that are needed. That mappings will be covered by 2MB
+	 * PMD entries so we can conservatively calculate the required
+	 * number of P4D, PUD and PMD structures needed to perform the
+	 * mappings. Incrementing the count for each covers the case where
+	 * the addresses cross entries.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+		p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+		pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	} else {
+		p4d_size = 0;
+		pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	}
+	pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+	pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+	total = p4d_size + pud_size + pmd_size;
+
+	/*
+	 * Now calculate the added pagetable structures needed to populate
+	 * the new pagetables.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+		p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+		pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	} else {
+		p4d_size = 0;
+		pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	}
+	pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+	pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+	total += p4d_size + pud_size + pmd_size;
+
+	return total;
+}
+
+void __init sme_encrypt_kernel(void)
+{
+	unsigned long workarea_start, workarea_end, workarea_len;
+	unsigned long execute_start, execute_end, execute_len;
+	unsigned long kernel_start, kernel_end, kernel_len;
+	unsigned long pgtable_area_len;
+	unsigned long paddr, pmd_flags;
+	unsigned long decrypted_base;
+	void *pgtable_area;
+	pgd_t *pgd;
+
+	if (!sme_active())
+		return;
+
+	/*
+	 * Prepare for encrypting the kernel by building new pagetables with
+	 * the necessary attributes needed to encrypt the kernel in place.
+	 *
+	 *   One range of virtual addresses will map the memory occupied
+	 *   by the kernel as encrypted.
+	 *
+	 *   Another range of virtual addresses will map the memory occupied
+	 *   by the kernel as decrypted and write-protected.
+	 *
+	 *     The use of write-protect attribute will prevent any of the
+	 *     memory from being cached.
+	 */
+
+	/* Physical addresses gives us the identity mapped virtual addresses */
+	kernel_start = __pa_symbol(_text);
+	kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+	kernel_len = kernel_end - kernel_start;
+
+	/* Set the encryption workarea to be immediately after the kernel */
+	workarea_start = kernel_end;
+
+	/*
+	 * Calculate required number of workarea bytes needed:
+	 *   executable encryption area size:
+	 *     stack page (PAGE_SIZE)
+	 *     encryption routine page (PAGE_SIZE)
+	 *     intermediate copy buffer (PMD_PAGE_SIZE)
+	 *   pagetable structures for the encryption of the kernel
+	 *   pagetable structures for workarea (in case not currently mapped)
+	 */
+	execute_start = workarea_start;
+	execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+	execute_len = execute_end - execute_start;
+
+	/*
+	 * One PGD for both encrypted and decrypted mappings and a set of
+	 * PUDs and PMDs for each of the encrypted and decrypted mappings.
+	 */
+	pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+	pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+	/* PUDs and PMDs needed in the current pagetables for the workarea */
+	pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+	/*
+	 * The total workarea includes the executable encryption area and
+	 * the pagetable area.
+	 */
+	workarea_len = execute_len + pgtable_area_len;
+	workarea_end = workarea_start + workarea_len;
+
+	/*
+	 * Set the address to the start of where newly created pagetable
+	 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+	 * structures are created when the workarea is added to the current
+	 * pagetables and when the new encrypted and decrypted kernel
+	 * mappings are populated.
+	 */
+	pgtable_area = (void *)execute_end;
+
+	/*
+	 * Make sure the current pagetable structure has entries for
+	 * addressing the workarea.
+	 */
+	pgd = (pgd_t *)native_read_cr3_pa();
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Flush the TLB - no globals so cr3 is enough */
+	native_write_cr3(__native_read_cr3());
+
+	/*
+	 * A new pagetable structure is being built to allow for the kernel
+	 * to be encrypted. It starts with an empty PGD that will then be
+	 * populated with new PUDs and PMDs as the encrypted and decrypted
+	 * kernel mappings are created.
+	 */
+	pgd = pgtable_area;
+	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+	pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+	/* Add encrypted kernel (identity) mappings */
+	pmd_flags = PMD_FLAGS | _PAGE_ENC;
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/*
+	 * A different PGD index/entry must be used to get different
+	 * pagetable entries for the decrypted mapping. Choose the next
+	 * PGD index and convert it to a virtual address to be used as
+	 * the base of the mapping.
+	 */
+	decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+	decrypted_base <<= PGDIR_SHIFT;
+
+	/* Add decrypted, write-protected kernel (non-identity) mappings */
+	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Add decrypted workarea mappings to both kernel mappings */
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Perform the encryption */
+	sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+			    kernel_len, workarea_start, (unsigned long)pgd);
+
+	/*
+	 * At this point we are running encrypted.  Remove the mappings for
+	 * the decrypted areas - all that is needed for this is to remove
+	 * the PGD entry/entries.
+	 */
+	sme_clear_pgd(pgd, kernel_start + decrypted_base,
+		      kernel_end + decrypted_base);
+
+	sme_clear_pgd(pgd, workarea_start + decrypted_base,
+		      workarea_end + decrypted_base);
+
+	/* Flush the TLB - no globals so cr3 is enough */
+	native_write_cr3(__native_read_cr3());
+}
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+	const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+	unsigned int eax, ebx, ecx, edx;
+	bool active_by_default;
+	unsigned long me_mask;
+	char buffer[16];
+	u64 msr;
+
+	/* Check for the SME support leaf */
+	eax = 0x80000000;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (eax < 0x8000001f)
+		return;
+
+	/*
+	 * Check for the SME feature:
+	 *   CPUID Fn8000_001F[EAX] - Bit 0
+	 *     Secure Memory Encryption support
+	 *   CPUID Fn8000_001F[EBX] - Bits 5:0
+	 *     Pagetable bit position used to indicate encryption
+	 */
+	eax = 0x8000001f;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (!(eax & 1))
+		return;
+
+	me_mask = 1UL << (ebx & 0x3f);
+
+	/* Check if SME is enabled */
+	msr = __rdmsr(MSR_K8_SYSCFG);
+	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+		return;
+
+	/*
+	 * Fixups have not been applied to phys_base yet and we're running
+	 * identity mapped, so we must obtain the address to the SME command
+	 * line argument data using rip-relative addressing.
+	 */
+	asm ("lea sme_cmdline_arg(%%rip), %0"
+	     : "=r" (cmdline_arg)
+	     : "p" (sme_cmdline_arg));
+	asm ("lea sme_cmdline_on(%%rip), %0"
+	     : "=r" (cmdline_on)
+	     : "p" (sme_cmdline_on));
+	asm ("lea sme_cmdline_off(%%rip), %0"
+	     : "=r" (cmdline_off)
+	     : "p" (sme_cmdline_off));
+
+	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+		active_by_default = true;
+	else
+		active_by_default = false;
+
+	cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+				     ((u64)bp->ext_cmd_line_ptr << 32));
+
+	cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+	if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+		sme_me_mask = me_mask;
+	else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+		sme_me_mask = 0;
+	else
+		sme_me_mask = active_by_default ? me_mask : 0;
+}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000000..730e6d541df1
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+	.text
+	.code64
+ENTRY(sme_encrypt_execute)
+
+	/*
+	 * Entry parameters:
+	 *   RDI - virtual address for the encrypted kernel mapping
+	 *   RSI - virtual address for the decrypted kernel mapping
+	 *   RDX - length of kernel
+	 *   RCX - virtual address of the encryption workarea, including:
+	 *     - stack page (PAGE_SIZE)
+	 *     - encryption routine page (PAGE_SIZE)
+	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
+	 *    R8 - physcial address of the pagetables to use for encryption
+	 */
+
+	push	%rbp
+	movq	%rsp, %rbp		/* RBP now has original stack pointer */
+
+	/* Set up a one page stack in the non-encrypted memory area */
+	movq	%rcx, %rax		/* Workarea stack page */
+	leaq	PAGE_SIZE(%rax), %rsp	/* Set new stack pointer */
+	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
+
+	push	%r12
+	movq	%rdi, %r10		/* Encrypted kernel */
+	movq	%rsi, %r11		/* Decrypted kernel */
+	movq	%rdx, %r12		/* Kernel length */
+
+	/* Copy encryption routine into the workarea */
+	movq	%rax, %rdi				/* Workarea encryption routine */
+	leaq	__enc_copy(%rip), %rsi			/* Encryption routine */
+	movq	$(.L__enc_copy_end - __enc_copy), %rcx	/* Encryption routine length */
+	rep	movsb
+
+	/* Setup registers for call */
+	movq	%r10, %rdi		/* Encrypted kernel */
+	movq	%r11, %rsi		/* Decrypted kernel */
+	movq	%r8, %rdx		/* Pagetables used for encryption */
+	movq	%r12, %rcx		/* Kernel length */
+	movq	%rax, %r8		/* Workarea encryption routine */
+	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
+
+	call	*%rax			/* Call the encryption routine */
+
+	pop	%r12
+
+	movq	%rbp, %rsp		/* Restore original stack pointer */
+	pop	%rbp
+
+	ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+	/* Enable the new page tables */
+	mov	%rdx, %cr3
+
+	/* Flush any global TLBs */
+	mov	%cr4, %rdx
+	andq	$~X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+	orq	$X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+
+	/* Set the PAT register PA5 entry to write-protect */
+	push	%rcx
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	push	%rdx			/* Save original PAT value */
+	andl	$0xffff00ff, %edx	/* Clear PA5 */
+	orl	$0x00000500, %edx	/* Set PA5 to WP */
+	wrmsr
+	pop	%rdx			/* RDX contains original PAT value */
+	pop	%rcx
+
+	movq	%rcx, %r9		/* Save kernel length */
+	movq	%rdi, %r10		/* Save encrypted kernel address */
+	movq	%rsi, %r11		/* Save decrypted kernel address */
+
+	wbinvd				/* Invalidate any cache entries */
+
+	/* Copy/encrypt 2MB at a time */
+1:
+	movq	%r11, %rsi		/* Source - decrypted kernel */
+	movq	%r8, %rdi		/* Dest   - intermediate copy buffer */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	movq	%r8, %rsi		/* Source - intermediate copy buffer */
+	movq	%r10, %rdi		/* Dest   - encrypted kernel */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	addq	$PMD_PAGE_SIZE, %r11
+	addq	$PMD_PAGE_SIZE, %r10
+	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
+	jnz	1b			/* Kernel length not zero? */
+
+	/* Restore PAT register */
+	push	%rdx			/* Save original PAT value */
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	pop	%rdx			/* Restore original PAT value */
+	wrmsr
+
+	ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 229d04a83f85..a99679826846 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -37,22 +37,21 @@ struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
 
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
 {
 	return IA32_PAGE_OFFSET;
 }
 
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
 {
-	return TASK_SIZE_MAX;
+	return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
 }
 
 static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
 	unsigned long max = 0;
-	if ((current->flags & PF_RANDOMIZE) &&
-		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+	if (current->flags & PF_RANDOMIZE) {
+		max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
 		max <<= PAGE_SHIFT;
 	}
 
@@ -79,13 +78,13 @@ static int mmap_is_legacy(void)
 
 static unsigned long arch_rnd(unsigned int rndbits)
 {
+	if (!(current->flags & PF_RANDOMIZE))
+		return 0;
 	return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
 }
 
 unsigned long arch_mmap_rnd(void)
 {
-	if (!(current->flags & PF_RANDOMIZE))
-		return 0;
 	return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
 }
 
@@ -142,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
 	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
-			arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+			arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 	/*
@@ -152,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	 * mmap_base, the compat syscall uses mmap_compat_base.
 	 */
 	arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
-			arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+			arch_rnd(mmap32_rnd_bits), task_size_32bit());
 #endif
 }
 
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 1c34b767c84c..9ceaa955d2ba 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
 	 */
 	bd_base = mpx_get_bounds_dir();
 	down_write(&mm->mmap_sem);
+
+	/* MPX doesn't support addresses above 47 bits yet. */
+	if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+		pr_warn_once("%s (%d): MPX cannot handle addresses "
+				"above 47-bits. Disabling.",
+				current->comm, current->pid);
+		ret = -ENXIO;
+		goto out;
+	}
 	mm->context.bd_addr = bd_base;
 	if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
 		ret = -ENXIO;
-
+out:
 	up_write(&mm->mmap_sem);
 	return ret;
 }
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (ret)
 		force_sig(SIGSEGV, current);
 }
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+		unsigned long flags)
+{
+	if (!kernel_managing_mpx_tables(current->mm))
+		return addr;
+	if (addr + len <= DEFAULT_MAP_WINDOW)
+		return addr;
+	if (flags & MAP_FIXED)
+		return -ENOMEM;
+
+	/*
+	 * Requested len is larger than the whole area we're allowed to map in.
+	 * Resetting hinting address wouldn't do much good -- fail early.
+	 */
+	if (len > DEFAULT_MAP_WINDOW)
+		return -ENOMEM;
+
+	/* Look for unmap area within DEFAULT_MAP_WINDOW */
+	return 0;
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce3dedf..d805162e6045 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 
 /*
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.  The return value is the number of nodes allocated.
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
  */
 static int __init split_nodes_interleave(struct numa_meminfo *ei,
 					 struct numa_meminfo *pi,
 					 u64 addr, u64 max_addr, int nr_nodes)
 {
-	nodemask_t physnode_mask = NODE_MASK_NONE;
+	nodemask_t physnode_mask = numa_nodes_parsed;
 	u64 size;
 	int big;
 	int nid = 0;
@@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 		return -1;
 	}
 
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
 	/*
 	 * Continue to fill physical nodes with fake nodes until there is no
 	 * memory left on any of them.
@@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 
 /*
  * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
  */
 static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 					      struct numa_meminfo *pi,
 					      u64 addr, u64 max_addr, u64 size)
 {
-	nodemask_t physnode_mask = NODE_MASK_NONE;
+	nodemask_t physnode_mask = numa_nodes_parsed;
 	u64 min_size;
 	int nid = 0;
 	int i, ret;
@@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	}
 	size &= FAKE_NODE_MIN_HASH_MASK;
 
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
 	/*
 	 * Fill physical nodes with fake nodes of size until there is no memory
 	 * left on any of them.
@@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	return 0;
 }
 
+int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+	int i, max_emu_nid = 0;
+
+	*dfl_phys_nid = NUMA_NO_NODE;
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+			max_emu_nid = i;
+			if (*dfl_phys_nid == NUMA_NO_NODE)
+				*dfl_phys_nid = emu_nid_to_phys[i];
+		}
+	}
+
+	return max_emu_nid;
+}
+
 /**
  * numa_emulation - Emulate NUMA nodes
  * @numa_meminfo: NUMA configuration to massage
@@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	 * Determine the max emulated nid and the default phys nid to use
 	 * for unmapped nodes.
 	 */
-	max_emu_nid = 0;
-	dfl_phys_nid = NUMA_NO_NODE;
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
-		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
-			max_emu_nid = i;
-			if (dfl_phys_nid == NUMA_NO_NODE)
-				dfl_phys_nid = emu_nid_to_phys[i];
-		}
-	}
-	if (dfl_phys_nid == NUMA_NO_NODE) {
-		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
-		goto no_emu;
-	}
+	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
 
 	/* commit */
 	*numa_meminfo = ei;
 
+	/* Make sure numa_nodes_parsed only contains emulated nodes */
+	nodes_clear(numa_nodes_parsed);
+	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+		if (ei.blk[i].start != ei.blk[i].end &&
+		    ei.blk[i].nid != NUMA_NO_NODE)
+			node_set(ei.blk[i].nid, numa_nodes_parsed);
+
 	/*
 	 * Transform __apicid_to_node table to use emulated nids by
 	 * reverse-mapping phys_nid.  The maps should always exist but fall
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 757b0bcdf712..dfb7d657cf43 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
 					__pgprot(0), 1, 0, NULL);
 }
 
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+	struct cpa_data cpa;
+	unsigned long start;
+	int ret;
+
+	/* Nothing to do if the SME is not active */
+	if (!sme_active())
+		return 0;
+
+	/* Should not be working on unaligned addresses */
+	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+		addr &= PAGE_MASK;
+
+	start = addr;
+
+	memset(&cpa, 0, sizeof(cpa));
+	cpa.vaddr = &addr;
+	cpa.numpages = numpages;
+	cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+	cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+	cpa.pgd = init_mm.pgd;
+
+	/* Must avoid aliasing mappings in the highmem code */
+	kmap_flush_unused();
+	vm_unmap_aliases();
+
+	/*
+	 * Before changing the encryption attribute, we need to flush caches.
+	 */
+	if (static_cpu_has(X86_FEATURE_CLFLUSH))
+		cpa_flush_range(start, numpages, 1);
+	else
+		cpa_flush_all(1);
+
+	ret = __change_page_attr_set_clr(&cpa, 1);
+
+	/*
+	 * After changing the encryption attribute, we need to flush TLBs
+	 * again in case any speculative TLB caching occurred (but no need
+	 * to flush caches again).  We could just use cpa_flush_all(), but
+	 * in case TLB flushing gets optimized in the cpa_flush_range()
+	 * path use the same logic as above.
+	 */
+	if (static_cpu_has(X86_FEATURE_CLFLUSH))
+		cpa_flush_range(start, numpages, 0);
+	else
+		cpa_flush_all(0);
+
+	return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
 int set_pages_uc(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
 	if (!(page_flags & _PAGE_RW))
 		cpa.mask_clr = __pgprot(_PAGE_RW);
 
+	if (!(page_flags & _PAGE_ENC))
+		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
 	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
 
 	retval = __change_page_attr_set_clr(&cpa, 0);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 45979502f64b..fe7d57a8fb60 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -293,7 +293,7 @@ void init_cache_modes(void)
  * pat_init - Initialize PAT MSR and PAT table
  *
  * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
  *
  * This function must be called on all CPUs using the specific sequence of
  * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
 		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 		 *      011    3    UC : _PAGE_CACHE_MODE_UC
 		 *      100    4    WB : Reserved
-		 *      101    5    WC : Reserved
+		 *      101    5    WP : _PAGE_CACHE_MODE_WP
 		 *      110    6    UC-: Reserved
 		 *      111    7    WT : _PAGE_CACHE_MODE_WT
 		 *
@@ -360,7 +360,7 @@ void pat_init(void)
 		 * corresponding types in the presence of PAT errata.
 		 */
 		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
 	}
 
 	if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t vma_prot)
 {
+	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+		vma_prot = pgprot_decrypted(vma_prot);
+
 	return vma_prot;
 }
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..218834a3e9ad 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
 	pgtable_page_dtor(pte);
 	paravirt_release_pte(page_to_pfn(pte));
-	tlb_remove_page(tlb, pte);
+	tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 	tlb->need_flush_all = 1;
 #endif
 	pgtable_pmd_page_dtor(page);
-	tlb_remove_page(tlb, page);
+	tlb_remove_table(tlb, page);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pud));
+	tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(p4d));
+	tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..ce104b962a17 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,42 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+			    u16 *new_asid, bool *need_flush)
+{
+	u16 asid;
+
+	if (!static_cpu_has(X86_FEATURE_PCID)) {
+		*new_asid = 0;
+		*need_flush = true;
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+		    next->context.ctx_id)
+			continue;
+
+		*new_asid = asid;
+		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+			       next_tlb_gen);
+		return;
+	}
+
+	/*
+	 * We don't currently own an ASID slot on this CPU.
+	 * Allocate a slot.
+	 */
+	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+	if (*new_asid >= TLB_NR_DYN_ASIDS) {
+		*new_asid = 0;
+		this_cpu_write(cpu_tlbstate.next_asid, 1);
+	}
+	*need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,12 +79,11 @@ void leave_mm(int cpu)
 	if (loaded_mm == &init_mm)
 		return;
 
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-		BUG();
+	/* Warn if we're not lazy. */
+	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
 
 	switch_mm(NULL, &init_mm, NULL);
 }
-EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
-	unsigned cpu = smp_processor_id();
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	unsigned cpu = smp_processor_id();
+	u64 next_tlb_gen;
 
 	/*
-	 * NB: The scheduler will call us with prev == next when
-	 * switching from lazy TLB mode to normal mode if active_mm
-	 * isn't changing.  When this happens, there is no guarantee
-	 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+	 * NB: The scheduler will call us with prev == next when switching
+	 * from lazy TLB mode to normal mode if active_mm isn't changing.
+	 * When this happens, we don't assume that CR3 (and hence
+	 * cpu_tlbstate.loaded_mm) matches next.
 	 *
 	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
 	 */
 
-	this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+	/* We don't want flush_tlb_func_* to run concurrently with us. */
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+		WARN_ON_ONCE(!irqs_disabled());
+
+	/*
+	 * Verify that CR3 is what we think it is.  This will catch
+	 * hypothetical buggy code that directly switches to swapper_pg_dir
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
+	 */
+	VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
 	if (real_prev == next) {
-		/*
-		 * There's nothing to do: we always keep the per-mm control
-		 * regs in sync with cpu_tlbstate.loaded_mm.  Just
-		 * sanity-check mm_cpumask.
-		 */
-		if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
-			cpumask_set_cpu(cpu, mm_cpumask(next));
-		return;
-	}
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+			  next->context.ctx_id);
+
+		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+			/*
+			 * There's nothing to do: we weren't lazy, and we
+			 * aren't changing our mm.  We don't need to flush
+			 * anything, nor do we need to update CR3, CR4, or
+			 * LDTR.
+			 */
+			return;
+		}
+
+		/* Resume remote flushes and then read tlb_gen. */
+		cpumask_set_cpu(cpu, mm_cpumask(next));
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+		    next_tlb_gen) {
+			/*
+			 * Ideally, we'd have a flush_tlb() variant that
+			 * takes the known CR3 value as input.  This would
+			 * be faster on Xen PV and on hypothetical CPUs
+			 * on which INVPCID is fast.
+			 */
+			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
+				       next_tlb_gen);
+			write_cr3(__sme_pa(next->pgd) | prev_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+					TLB_FLUSH_ALL);
+		}
 
-	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 		/*
-		 * If our current stack is in vmalloc space and isn't
-		 * mapped in the new pgd, we'll double-fault.  Forcibly
-		 * map it.
+		 * We just exited lazy mode, which means that CR4 and/or LDTR
+		 * may be stale.  (Changes to the required CR4 and LDTR states
+		 * are not reflected in tlb_gen.)
 		 */
-		unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-		pgd_t *pgd = next->pgd + stack_pgd_index;
-
-		if (unlikely(pgd_none(*pgd)))
-			set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-	}
+	} else {
+		u16 new_asid;
+		bool need_flush;
+
+		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+			/*
+			 * If our current stack is in vmalloc space and isn't
+			 * mapped in the new pgd, we'll double-fault.  Forcibly
+			 * map it.
+			 */
+			unsigned int index = pgd_index(current_stack_pointer());
+			pgd_t *pgd = next->pgd + index;
+
+			if (unlikely(pgd_none(*pgd)))
+				set_pgd(pgd, init_mm.pgd[index]);
+		}
 
-	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		/* Stop remote flushes for the previous mm */
+		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
-	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-	cpumask_set_cpu(cpu, mm_cpumask(next));
+		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 
-	/*
-	 * Re-load page tables.
-	 *
-	 * This logic has an ordering constraint:
-	 *
-	 *  CPU 0: Write to a PTE for 'next'
-	 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-	 *  CPU 1: set bit 1 in next's mm_cpumask
-	 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-	 *
-	 * We need to prevent an outcome in which CPU 1 observes
-	 * the new PTE value and CPU 0 observes bit 1 clear in
-	 * mm_cpumask.  (If that occurs, then the IPI will never
-	 * be sent, and CPU 0's TLB will contain a stale entry.)
-	 *
-	 * The bad outcome can occur if either CPU's load is
-	 * reordered before that CPU's store, so both CPUs must
-	 * execute full barriers to prevent this from happening.
-	 *
-	 * Thus, switch_mm needs a full barrier between the
-	 * store to mm_cpumask and any operation that could load
-	 * from next->pgd.  TLB fills are special and can happen
-	 * due to instruction fetches or for no reason at all,
-	 * and neither LOCK nor MFENCE orders them.
-	 * Fortunately, load_cr3() is serializing and gives the
-	 * ordering guarantee we need.
-	 */
-	load_cr3(next->pgd);
-
-	/*
-	 * This gets called via leave_mm() in the idle path where RCU
-	 * functions differently.  Tracing normally uses RCU, so we have to
-	 * call the tracepoint specially here.
-	 */
-	trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+		/*
+		 * Start remote flushes and then read tlb_gen.
+		 */
+		cpumask_set_cpu(cpu, mm_cpumask(next));
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			write_cr3(__sme_pa(next->pgd) | new_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+					TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
 
-	/* Stop flush ipis for the previous mm */
-	WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-		     real_prev != &init_mm);
-	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+	}
 
-	/* Load per-mm CR4 and LDTR state */
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen.  We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
 				  bool local, enum tlb_flush_reason reason)
 {
+	/*
+	 * We have three different tlb_gen values in here.  They are:
+	 *
+	 * - mm_tlb_gen:     the latest generation.
+	 * - local_tlb_gen:  the generation that this CPU has already caught
+	 *                   up to.
+	 * - f->new_tlb_gen: the generation that the requester of the flush
+	 *                   wants us to catch up to.
+	 */
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
-	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
-		leave_mm(smp_processor_id());
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+		   loaded_mm->context.ctx_id);
+
+	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+		/*
+		 * We're in lazy mode -- don't flush.  We can get here on
+		 * remote flushes due to races and on local flushes if a
+		 * kernel thread coincidentally flushes the mm it's lazily
+		 * still using.
+		 */
 		return;
 	}
 
-	if (f->end == TLB_FLUSH_ALL) {
-		local_flush_tlb();
-		if (local)
-			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-		trace_tlb_flush(reason, TLB_FLUSH_ALL);
-	} else {
+	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+		/*
+		 * There's nothing to do: we're already up to date.  This can
+		 * happen if two concurrent flushes happen -- the first flush to
+		 * be handled can catch us all the way up, leaving no work for
+		 * the second flush.
+		 */
+		trace_tlb_flush(reason, 0);
+		return;
+	}
+
+	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+	/*
+	 * If we get to this point, we know that our TLB is out of date.
+	 * This does not strictly imply that we need to flush (it's
+	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+	 * going to need to flush in the very near future, so we might
+	 * as well get it over with.
+	 *
+	 * The only question is whether to do a full or partial flush.
+	 *
+	 * We do a partial flush if requested and two extra conditions
+	 * are met:
+	 *
+	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
+	 *    we've always done all needed flushes to catch up to
+	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
+	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+	 *    us up to date for tlb_gen 3 is the partial flush we're
+	 *    processing.
+	 *
+	 *    As an example of why this check is needed, suppose that there
+	 *    are two concurrent flushes.  The first is a full flush that
+	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
+	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
+	 *    processed on this CPU in reverse order, we'll see
+	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+	 *    3, we'd be break the invariant: we'd update local_tlb_gen above
+	 *    1 without the full flush that's needed for tlb_gen 2.
+	 *
+	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimiation.
+	 *    Partial TLB flushes are not all that much cheaper than full TLB
+	 *    flushes, so it seems unlikely that it would be a performance win
+	 *    to do a partial flush if that won't bring our TLB fully up to
+	 *    date.  By doing a full flush instead, we can increase
+	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+	 *    avoid another flush in the very near future.
+	 */
+	if (f->end != TLB_FLUSH_ALL &&
+	    f->new_tlb_gen == local_tlb_gen + 1 &&
+	    f->new_tlb_gen == mm_tlb_gen) {
+		/* Partial flush */
 		unsigned long addr;
 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
 		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_single(addr);
@@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		if (local)
 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 		trace_tlb_flush(reason, nr_pages);
+	} else {
+		/* Full flush. */
+		local_flush_tlb();
+		if (local)
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+		trace_tlb_flush(reason, TLB_FLUSH_ALL);
 	}
+
+	/* Both paths above update our state to mm_tlb_gen. */
+	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 				(info->end - info->start) >> PAGE_SHIFT);
 
 	if (is_uv_system()) {
+		/*
+		 * This whole special case is confused.  UV has a "Broadcast
+		 * Assist Unit", which seems to be a fancy way to send IPIs.
+		 * Back when x86 used an explicit TLB flush IPI, UV was
+		 * optimized to use its own mechanism.  These days, x86 uses
+		 * smp_call_function_many(), but UV still uses a manual IPI,
+		 * and that IPI's action is out of date -- it does a manual
+		 * flush instead of calling flush_tlb_func_remote().  This
+		 * means that the percpu tlb_gen variables won't be updated
+		 * and we'll do pointless flushes on future context switches.
+		 *
+		 * Rather than hooking native_flush_tlb_others() here, I think
+		 * that UV should be updated so that smp_call_function_many(),
+		 * etc, are optimal on UV.
+		 */
 		unsigned int cpu;
 
 		cpu = smp_processor_id();
@@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	cpu = get_cpu();
 
-	/* Synchronize with switch_mm. */
-	smp_mb();
+	/* This is also a barrier that synchronizes with switch_mm(). */
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), &info);
+
 	put_cpu();
 }
 
@@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	__flush_tlb_all();
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-		leave_mm(smp_processor_id());
 }
 
 void flush_tlb_all(void)
@@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
 		flush_tlb_others(&batch->cpumask, &info);
+
 	cpumask_clear(&batch->cpumask);
 
 	put_cpu();
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132b0ed4..7a5350d08cef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
 
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = ioremap(pa_data, sizeof(*rom));
+		data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
 			}
 		}
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 	}
 	set_dma_domain_ops(dev);
 	set_dev_domain_options(dev);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f084d8718ac4..6217b23e85f6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
 /*
  * Convenience functions to obtain memory types and attributes
  */
-u32 efi_mem_type(unsigned long phys_addr)
+int efi_mem_type(unsigned long phys_addr)
 {
 	efi_memory_desc_t *md;
 
 	if (!efi_enabled(EFI_MEMMAP))
-		return 0;
+		return -ENOTSUPP;
 
 	for_each_efi_memory_desc(md) {
 		if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
 				  (md->num_pages << EFI_PAGE_SHIFT))))
 			return md->type;
 	}
-	return 0;
+	return -EINVAL;
 }
 
 static int __init arch_parse_efi_cmdline(char *str)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9bf72f5bfedb..12e83888e5b9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-	unsigned long pfn, text;
+	unsigned long pfn, text, pf;
 	struct page *page;
 	unsigned npages;
 	pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+	/*
+	 * Since the PGD is encrypted, set the encryption mask so that when
+	 * this value is loaded into cr3 the PGD will be decrypted during
+	 * the pagetable walk.
+	 */
+	efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
 	pgd = efi_pgd;
 
 	/*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	 * phys_efi_set_virtual_address_map().
 	 */
 	pfn = pa_memmap >> PAGE_SHIFT;
-	if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
+	pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+	if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
 		pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
 		return 1;
 	}
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	text = __pa(_text);
 	pfn = text >> PAGE_SHIFT;
 
-	if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
+	pf = _PAGE_RW | _PAGE_ENC;
+	if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
 		pr_err("Failed to map kernel text 1:1\n");
 		return 1;
 	}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 6e075afa7877..58337b2bc682 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -38,8 +38,10 @@ static void __init *max7315_platform_data(void *info)
 	 */
 	strcpy(i2c_info->type, "max7315");
 	if (nr++) {
-		sprintf(base_pin_name, "max7315_%d_base", nr);
-		sprintf(intr_pin_name, "max7315_%d_int", nr);
+		snprintf(base_pin_name, sizeof(base_pin_name),
+			 "max7315_%d_base", nr);
+		snprintf(intr_pin_name, sizeof(intr_pin_name),
+			 "max7315_%d_int", nr);
 	} else {
 		strcpy(base_pin_name, "max7315_base");
 		strcpy(intr_pin_name, "max7315_int");
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index d4a61ddf9e62..f44c0bc95aa2 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -26,7 +26,7 @@
 static struct bau_operations ops __ro_after_init;
 
 /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
-static int timeout_base_ns[] = {
+static const int timeout_base_ns[] = {
 		20,
 		160,
 		1280,
@@ -40,7 +40,6 @@ static int timeout_base_ns[] = {
 static int timeout_us;
 static bool nobau = true;
 static int nobau_perm;
-static cycles_t congested_cycles;
 
 /* tunables: */
 static int max_concurr		= MAX_BAU_CONCURRENT;
@@ -829,10 +828,10 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
 		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
 			bcp->period_requests++;
 			bcp->period_time += elapsed;
-			if ((elapsed > congested_cycles) &&
+			if ((elapsed > usec_2_cycles(bcp->cong_response_us)) &&
 			    (bcp->period_requests > bcp->cong_reps) &&
 			    ((bcp->period_time / bcp->period_requests) >
-							congested_cycles)) {
+					usec_2_cycles(bcp->cong_response_us))) {
 				stat->s_congested++;
 				disable_for_period(bcp, stat);
 			}
@@ -1217,7 +1216,7 @@ static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
  * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
  * Such a message must be ignored.
  */
-void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
 {
 	unsigned long mmr_image;
 	unsigned char swack_vec;
@@ -2222,14 +2221,17 @@ static int __init uv_bau_init(void)
 	else if (is_uv1_hub())
 		ops = uv1_bau_ops;
 
+	nuvhubs = uv_num_possible_blades();
+	if (nuvhubs < 2) {
+		pr_crit("UV: BAU disabled - insufficient hub count\n");
+		goto err_bau_disable;
+	}
+
 	for_each_possible_cpu(cur_cpu) {
 		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
 		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
 	}
 
-	nuvhubs = uv_num_possible_blades();
-	congested_cycles = usec_2_cycles(congested_respns_us);
-
 	uv_base_pnode = 0x7fffffff;
 	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
 		cpus = uv_blade_nr_possible_cpus(uvhub);
@@ -2242,9 +2244,8 @@ static int __init uv_bau_init(void)
 		enable_timeouts();
 
 	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
-		set_bau_off();
-		nobau_perm = 1;
-		return 0;
+		pr_crit("UV: BAU disabled - per CPU init failed\n");
+		goto err_bau_disable;
 	}
 
 	vector = UV_BAU_MESSAGE;
@@ -2270,6 +2271,16 @@ static int __init uv_bau_init(void)
 	}
 
 	return 0;
+
+err_bau_disable:
+
+	for_each_possible_cpu(cur_cpu)
+		free_cpumask_var(per_cpu(uv_flush_tlb_mask, cur_cpu));
+
+	set_bau_off();
+	nobau_perm = 1;
+
+	return -EINVAL;
 }
 core_initcall(uv_bau_init);
 fs_initcall(uv_ptc_init);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cd4be19c36dc..1f71980fc5e0 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,6 +1,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/set_memory.h>
 #include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
 
 	base = (unsigned char *)real_mode_header;
 
+	/*
+	 * If SME is active, the trampoline area will need to be in
+	 * decrypted memory in order to bring up other processors
+	 * successfully.
+	 */
+	set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
 	memcpy(base, real_mode_blob, size);
 
 	phys_base = __pa(base);
@@ -100,6 +108,10 @@ static void __init setup_real_mode(void)
 	trampoline_cr4_features = &trampoline_header->cr4;
 	*trampoline_cr4_features = mmu_cr4_features;
 
+	trampoline_header->flags = 0;
+	if (sme_active())
+		trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
 	trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index dac7b20d2f9d..614fd7064d0a 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -30,6 +30,7 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include <asm/realmode.h>
 #include "realmode.h"
 
 	.text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
 	movl	%edx, %fs
 	movl	%edx, %gs
 
+	/*
+	 * Check for memory encryption support. This is a safety net in
+	 * case BIOS hasn't done the necessary step of setting the bit in
+	 * the MSR for this AP. If SME is active and we've gotten this far
+	 * then it is safe for us to set the MSR bit and continue. If we
+	 * don't we'll eventually crash trying to execute encrypted
+	 * instructions.
+	 */
+	bt	$TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+	jnc	.Ldone
+	movl	$MSR_K8_SYSCFG, %ecx
+	rdmsr
+	bts	$MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+	jc	.Ldone
+
+	/*
+	 * Memory encryption is enabled but the SME enable bit for this
+	 * CPU has has not been set.  It is safe to set it, so do so.
+	 */
+	wrmsr
+.Ldone:
+
 	movl	pa_tr_cr4, %eax
 	movl	%eax, %cr4		# Enable PAE mode
 
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
 	tr_start:		.space	8
 	GLOBAL(tr_efer)		.space	8
 	GLOBAL(tr_cr4)		.space	4
+	GLOBAL(tr_flags)	.space	4
 END(trampoline_header)
 
 #include "trampoline_common.S"
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ae4cd58c0c7a..02250b2633b8 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -50,7 +50,7 @@ void foo(void)
 	DEFINE(HOST_GS, GS);
 	DEFINE(HOST_ORIG_AX, ORIG_EAX);
 #else
-#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET)
+#ifdef FP_XSTATE_MAGIC1
 	DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
 #else
 	DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 027987638e98..1ecd419811a2 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -17,6 +17,9 @@ config XEN_PV
 	bool "Xen PV guest support"
 	default y
 	depends on XEN
+	# XEN_PV is not ready to work with 5-level paging.
+	# Changes to hypervisor are also required.
+	depends on !X86_5LEVEL
 	select XEN_HAVE_PVMMU
 	select XEN_HAVE_VPMU
 	help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
 config XEN_PVH
 	bool "Support for running as a PVH guest"
 	depends on XEN && XEN_PVHVM && ACPI
+	# Pre-built page tables are not ready to handle 5-level paging.
+	depends on !X86_5LEVEL
 	def_bool n
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 87d791356ea9..de503c225ae1 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -12,6 +12,7 @@
 #include <asm/setup.h>
 #include <asm/hypervisor.h>
 #include <asm/e820/api.h>
+#include <asm/early_ioremap.h>
 
 #include <asm/xen/cpuid.h>
 #include <asm/xen/hypervisor.h>
@@ -21,38 +22,50 @@
 #include "mmu.h"
 #include "smp.h"
 
-void __ref xen_hvm_init_shared_info(void)
+static unsigned long shared_info_pfn;
+
+void xen_hvm_init_shared_info(void)
 {
 	struct xen_add_to_physmap xatp;
-	u64 pa;
-
-	if (HYPERVISOR_shared_info == &xen_dummy_shared_info) {
-		/*
-		 * Search for a free page starting at 4kB physical address.
-		 * Low memory is preferred to avoid an EPT large page split up
-		 * by the mapping.
-		 * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
-		 * the BIOS used for HVM guests is well behaved and won't
-		 * clobber memory other than the first 4kB.
-		 */
-		for (pa = PAGE_SIZE;
-		     !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
-		     memblock_is_reserved(pa);
-		     pa += PAGE_SIZE)
-			;
-
-		memblock_reserve(pa, PAGE_SIZE);
-		HYPERVISOR_shared_info = __va(pa);
-	}
 
 	xatp.domid = DOMID_SELF;
 	xatp.idx = 0;
 	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info);
+	xatp.gpfn = shared_info_pfn;
 	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
 		BUG();
 }
 
+static void __init reserve_shared_info(void)
+{
+	u64 pa;
+
+	/*
+	 * Search for a free page starting at 4kB physical address.
+	 * Low memory is preferred to avoid an EPT large page split up
+	 * by the mapping.
+	 * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+	 * the BIOS used for HVM guests is well behaved and won't
+	 * clobber memory other than the first 4kB.
+	 */
+	for (pa = PAGE_SIZE;
+	     !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+	     memblock_is_reserved(pa);
+	     pa += PAGE_SIZE)
+		;
+
+	shared_info_pfn = PHYS_PFN(pa);
+
+	memblock_reserve(pa, PAGE_SIZE);
+	HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
+}
+
+static void __init xen_hvm_init_mem_mapping(void)
+{
+	early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
+	HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+}
+
 static void __init init_hvm_pv_info(void)
 {
 	int major, minor;
@@ -153,6 +166,7 @@ static void __init xen_hvm_guest_init(void)
 
 	init_hvm_pv_info();
 
+	reserve_shared_info();
 	xen_hvm_init_shared_info();
 
 	/*
@@ -218,5 +232,6 @@ const struct hypervisor_x86 x86_hyper_xen_hvm = {
 	.init_platform          = xen_hvm_guest_init,
 	.pin_vcpu               = xen_pin_vcpu,
 	.x2apic_available       = xen_x2apic_para_available,
+	.init_mem_mapping	= xen_hvm_init_mem_mapping,
 };
 EXPORT_SYMBOL(x86_hyper_xen_hvm);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..ae2a2e2d6362 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void)
 	setup_clear_cpu_cap(X86_FEATURE_MTRR);
 	setup_clear_cpu_cap(X86_FEATURE_ACC);
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+	setup_clear_cpu_cap(X86_FEATURE_SME);
+
+	/*
+	 * Xen PV would need some work to support PCID: CR3 handling as well
+	 * as xen_flush_tlb_others() would need updating.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
 
 	if (!xen_initial_domain())
 		setup_clear_cpu_cap(X86_FEATURE_ACPI);
@@ -494,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 static inline bool desc_equal(const struct desc_struct *d1,
 			      const struct desc_struct *d2)
 {
-	return d1->a == d2->a && d1->b == d2->b;
+	return !memcmp(d1, d2, sizeof(*d1));
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
@@ -579,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 	preempt_enable();
 }
 
+#ifdef CONFIG_X86_64
+struct trap_array_entry {
+	void (*orig)(void);
+	void (*xen)(void);
+	bool ist_okay;
+};
+
+static struct trap_array_entry trap_array[] = {
+	{ debug,                       xen_xendebug,                    true },
+	{ int3,                        xen_xenint3,                     true },
+	{ double_fault,                xen_double_fault,                true },
+#ifdef CONFIG_X86_MCE
+	{ machine_check,               xen_machine_check,               true },
+#endif
+	{ nmi,                         xen_nmi,                         true },
+	{ overflow,                    xen_overflow,                    false },
+#ifdef CONFIG_IA32_EMULATION
+	{ entry_INT80_compat,          xen_entry_INT80_compat,          false },
+#endif
+	{ page_fault,                  xen_page_fault,                  false },
+	{ divide_error,                xen_divide_error,                false },
+	{ bounds,                      xen_bounds,                      false },
+	{ invalid_op,                  xen_invalid_op,                  false },
+	{ device_not_available,        xen_device_not_available,        false },
+	{ coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
+	{ invalid_TSS,                 xen_invalid_TSS,                 false },
+	{ segment_not_present,         xen_segment_not_present,         false },
+	{ stack_segment,               xen_stack_segment,               false },
+	{ general_protection,          xen_general_protection,          false },
+	{ spurious_interrupt_bug,      xen_spurious_interrupt_bug,      false },
+	{ coprocessor_error,           xen_coprocessor_error,           false },
+	{ alignment_check,             xen_alignment_check,             false },
+	{ simd_coprocessor_error,      xen_simd_coprocessor_error,      false },
+};
+
+static bool get_trap_addr(void **addr, unsigned int ist)
+{
+	unsigned int nr;
+	bool ist_okay = false;
+
+	/*
+	 * Replace trap handler addresses by Xen specific ones.
+	 * Check for known traps using IST and whitelist them.
+	 * The debugger ones are the only ones we care about.
+	 * Xen will handle faults like double_fault, * so we should never see
+	 * them.  Warn if there's an unexpected IST-using fault handler.
+	 */
+	for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
+		struct trap_array_entry *entry = trap_array + nr;
+
+		if (*addr == entry->orig) {
+			*addr = entry->xen;
+			ist_okay = entry->ist_okay;
+			break;
+		}
+	}
+
+	if (WARN_ON(ist != 0 && !ist_okay))
+		return false;
+
+	return true;
+}
+#endif
+
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
 	unsigned long addr;
 
-	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
+	if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
 
-	addr = gate_offset(*val);
+	addr = gate_offset(val);
 #ifdef CONFIG_X86_64
-	/*
-	 * Look for known traps using IST, and substitute them
-	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault,
-	 * so we should never see them.  Warn if
-	 * there's an unexpected IST-using fault handler.
-	 */
-	if (addr == (unsigned long)debug)
-		addr = (unsigned long)xen_debug;
-	else if (addr == (unsigned long)int3)
-		addr = (unsigned long)xen_int3;
-	else if (addr == (unsigned long)stack_segment)
-		addr = (unsigned long)xen_stack_segment;
-	else if (addr == (unsigned long)double_fault) {
-		/* Don't need to handle these */
+	if (!get_trap_addr((void **)&addr, val->bits.ist))
 		return 0;
-#ifdef CONFIG_X86_MCE
-	} else if (addr == (unsigned long)machine_check) {
-		/*
-		 * when xen hypervisor inject vMCE to guest,
-		 * use native mce handler to handle it
-		 */
-		;
-#endif
-	} else if (addr == (unsigned long)nmi)
-		/*
-		 * Use the native version as well.
-		 */
-		;
-	else {
-		/* Some other trap using IST? */
-		if (WARN_ON(val->ist != 0))
-			return 0;
-	}
 #endif	/* CONFIG_X86_64 */
 	info->address = addr;
 
-	info->cs = gate_segment(*val);
-	info->flags = val->dpl;
+	info->cs = gate_segment(val);
+	info->flags = val->bits.dpl;
 	/* interrupt gates clear IF */
-	if (val->type == GATE_INTERRUPT)
+	if (val->bits.type == GATE_INTERRUPT)
 		info->flags |= 1 << 2;
 
 	return 1;
@@ -981,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void)
 	}
 }
 
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
-			  unsigned long addr, unsigned len)
-{
-	char *start, *end, *reloc;
-	unsigned ret;
-
-	start = end = reloc = NULL;
-
-#define SITE(op, x)							\
-	case PARAVIRT_PATCH(op.x):					\
-	if (xen_have_vcpu_info_placement) {				\
-		start = (char *)xen_##x##_direct;			\
-		end = xen_##x##_direct_end;				\
-		reloc = xen_##x##_direct_reloc;				\
-	}								\
-	goto patch_site
-
-	switch (type) {
-		SITE(pv_irq_ops, irq_enable);
-		SITE(pv_irq_ops, irq_disable);
-		SITE(pv_irq_ops, save_fl);
-		SITE(pv_irq_ops, restore_fl);
-#undef SITE
-
-	patch_site:
-		if (start == NULL || (end-start) > len)
-			goto default_patch;
-
-		ret = paravirt_patch_insns(insnbuf, len, start, end);
-
-		/* Note: because reloc is assigned from something that
-		   appears to be an array, gcc assumes it's non-null,
-		   but doesn't know its relationship with start and
-		   end. */
-		if (reloc > start && reloc < end) {
-			int reloc_off = reloc - start;
-			long *relocp = (long *)(insnbuf + reloc_off);
-			long delta = start - (char *)addr;
-
-			*relocp += delta;
-		}
-		break;
-
-	default_patch:
-	default:
-		ret = paravirt_patch_default(type, clobbers, insnbuf,
-					     addr, len);
-		break;
-	}
-
-	return ret;
-}
-
 static const struct pv_info xen_info __initconst = {
 	.shared_kernel_pmd = 0,
 
@@ -1043,10 +1029,6 @@ static const struct pv_info xen_info __initconst = {
 	.name = "Xen",
 };
 
-static const struct pv_init_ops xen_init_ops __initconst = {
-	.patch = xen_patch,
-};
-
 static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.cpuid = xen_cpuid,
 
@@ -1244,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
-	pv_init_ops = xen_init_ops;
+	pv_init_ops.patch = paravirt_patch_default;
 	pv_cpu_ops = xen_cpu_ops;
 
 	x86_platform.get_nmi_reason = xen_get_nmi_reason;
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 33e92955e09d..d4eff5676cfa 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
 
 	.safe_halt = xen_safe_halt,
 	.halt = xen_halt,
-#ifdef CONFIG_X86_64
-	.adjust_exception_frame = xen_adjust_exception_frame,
-#endif
 };
 
 void __init xen_init_irq_ops(void)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cab28cf2cffb..e437714750f8 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 	/* Get the "official" set of cpus referring to our pagetable. */
 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
 		for_each_online_cpu(cpu) {
-			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
-			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+			if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 				continue;
 			smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
 		}
 		return;
 	}
-	cpumask_copy(mask, mm_cpumask(mm));
 
 	/*
 	 * It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 	 * look at its actual current cr3 value, and force it to flush
 	 * if needed.
 	 */
+	cpumask_clear(mask);
 	for_each_online_cpu(cpu) {
 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 			cpumask_set_cpu(cpu, mask);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 1ea598e5f030..51471408fdd1 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -19,6 +19,7 @@
 #include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/nmi.h>
+#include <linux/cpuhotplug.h>
 
 #include <asm/paravirt.h>
 #include <asm/desc.h>
@@ -413,7 +414,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
 	 */
 	tick_nohz_idle_enter();
 
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+	cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
 }
 
 #else /* !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index a1895a8e85c1..1ecb05db3632 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -309,7 +309,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 void xen_teardown_timer(int cpu)
 {
 	struct clock_event_device *evt;
-	BUG_ON(cpu == 0);
 	evt = &per_cpu(xen_clock_events, cpu).evt;
 
 	if (evt->irq >= 0) {
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index eff224df813f..dcd31fa39b5d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in percpu data) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/asm-offsets.h>
@@ -16,7 +10,7 @@
 #include <asm/processor-flags.h>
 #include <asm/frame.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
 
 /*
  * Enable events.  This clears the event mask and tests the pending
@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
 	jz 1f
 
-2:	call check_events
+	call check_events
 1:
-ENDPATCH(xen_irq_enable_direct)
 	FRAME_END
 	ret
 	ENDPROC(xen_irq_enable_direct)
-	RELOC(xen_irq_enable_direct, 2b+1)
 
 
 /*
@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
  */
 ENTRY(xen_irq_disable_direct)
 	movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
 	ret
-	ENDPROC(xen_irq_disable_direct)
-	RELOC(xen_irq_disable_direct, 0)
+ENDPROC(xen_irq_disable_direct)
 
 /*
  * (xen_)save_fl is used to get the current interrupt enable status.
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
 	setz %ah
 	addb %ah, %ah
-ENDPATCH(xen_save_fl_direct)
 	ret
 	ENDPROC(xen_save_fl_direct)
-	RELOC(xen_save_fl_direct, 0)
 
 
 /*
@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
 	/* check for unmasked and pending */
 	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
 	jnz 1f
-2:	call check_events
+	call check_events
 1:
-ENDPATCH(xen_restore_fl_direct)
 	FRAME_END
 	ret
 	ENDPROC(xen_restore_fl_direct)
-	RELOC(xen_restore_fl_direct, 2b+1)
 
 
 /*
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
deleted file mode 100644
index 465276467a47..000000000000
--- a/arch/x86/xen/xen-asm.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _XEN_XEN_ASM_H
-#define _XEN_XEN_ASM_H
-
-#include <linux/linkage.h>
-
-#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)	.globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI	0x80000000
-
-#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index feb6d40a0860..1200e262a116 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/thread_info.h>
@@ -18,21 +12,10 @@
 
 #include <xen/interface/xen.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
 
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-check_events:
-	push %eax
-	push %ecx
-	push %edx
-	call xen_force_evtchn_callback
-	pop %edx
-	pop %ecx
-	pop %eax
-	ret
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI  0x80000000
 
 /*
  * This is run where a normal iret would be run, with the same stack setup:
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..dae2cc33afb5 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/errno.h>
@@ -20,13 +14,44 @@
 
 #include <xen/interface/xen.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
+
+.macro xen_pv_trap name
+ENTRY(xen_\name)
+	pop %rcx
+	pop %r11
+	jmp  \name
+END(xen_\name)
+.endm
 
-ENTRY(xen_adjust_exception_frame)
-	mov 8+0(%rsp), %rcx
-	mov 8+8(%rsp), %r11
-	ret $16
-ENDPROC(xen_adjust_exception_frame)
+xen_pv_trap divide_error
+xen_pv_trap debug
+xen_pv_trap xendebug
+xen_pv_trap int3
+xen_pv_trap xenint3
+xen_pv_trap nmi
+xen_pv_trap overflow
+xen_pv_trap bounds
+xen_pv_trap invalid_op
+xen_pv_trap device_not_available
+xen_pv_trap double_fault
+xen_pv_trap coprocessor_segment_overrun
+xen_pv_trap invalid_TSS
+xen_pv_trap segment_not_present
+xen_pv_trap stack_segment
+xen_pv_trap general_protection
+xen_pv_trap page_fault
+xen_pv_trap spurious_interrupt_bug
+xen_pv_trap coprocessor_error
+xen_pv_trap alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+xen_pv_trap hypervisor_callback
 
 hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
 /*
@@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
  */
 ENTRY(xen_iret)
 	pushq $0
-1:	jmp hypercall_iret
-ENDPATCH(xen_iret)
-RELOC(xen_iret, 1b+1)
+	jmp hypercall_iret
 
 ENTRY(xen_sysret64)
 	/*
@@ -65,9 +88,7 @@ ENTRY(xen_sysret64)
 	pushq %rcx
 
 	pushq $VGCF_in_syscall
-1:	jmp hypercall_iret
-ENDPATCH(xen_sysret64)
-RELOC(xen_sysret64, 1b+1)
+	jmp hypercall_iret
 
 /*
  * Xen handles syscall callbacks much like ordinary exceptions, which
@@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1)
  *	rip
  *	r11
  * rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
  */
 
-.macro undo_xen_syscall
-	mov 0*8(%rsp), %rcx
-	mov 1*8(%rsp), %r11
-	mov 5*8(%rsp), %rsp
-.endm
-
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
-	undo_xen_syscall
-	jmp entry_SYSCALL_64_after_swapgs
+	popq %rcx
+	popq %r11
+
+	/*
+	 * Neither Xen nor the kernel really knows what the old SS and
+	 * CS were.  The kernel expects __USER_DS and __USER_CS, so
+	 * report those values even though Xen will guess its own values.
+	 */
+	movq $__USER_DS, 4*8(%rsp)
+	movq $__USER_CS, 1*8(%rsp)
+
+	jmp entry_SYSCALL_64_after_hwframe
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-	undo_xen_syscall
-	jmp entry_SYSCALL_compat
+	popq %rcx
+	popq %r11
+
+	/*
+	 * Neither Xen nor the kernel really knows what the old SS and
+	 * CS were.  The kernel expects __USER32_DS and __USER32_CS, so
+	 * report those values even though Xen will guess its own values.
+	 */
+	movq $__USER32_DS, 4*8(%rsp)
+	movq $__USER32_CS, 1*8(%rsp)
+
+	jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
-	undo_xen_syscall
+	mov 0*8(%rsp), %rcx
+	mov 1*8(%rsp), %r11
+	mov 5*8(%rsp), %rsp
 	jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 72a8e6adebe6..a7525e95d53f 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
 #else
 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
 	/* Map the p2m table to a 512GB-aligned user address. */
-	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad (PUD_SIZE * PTRS_PER_PUD))
 #endif
 #ifdef CONFIG_XEN_PV
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..c8a6d224f7ed 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void)
 }
 #endif
 
-/* Declare an asm function, along with symbols needed to make it
-   inlineable */
-#define DECL_ASM(ret, name, ...)		\
-	__visible ret name(__VA_ARGS__);	\
-	extern char name##_end[] __visible;	\
-	extern char name##_reloc[] __visible
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 __visible void xen_sysret32(void);
 __visible void xen_sysret64(void);
-__visible void xen_adjust_exception_frame(void);
 
 extern int xen_panic_handler_init(void);
 
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 2d716ebc5a5e..dff7cc39437c 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -1,5 +1,6 @@
 generic-y += bug.h
 generic-y += clkdev.h
+generic-y += device.h
 generic-y += div64.h
 generic-y += dma-contiguous.h
 generic-y += emergency-restart.h
@@ -17,6 +18,7 @@ generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
+generic-y += param.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
diff --git a/arch/xtensa/include/asm/device.h b/arch/xtensa/include/asm/device.h
deleted file mode 100644
index 1deeb8ebbb1b..000000000000
--- a/arch/xtensa/include/asm/device.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Arch specific extensions to struct device
- *
- * This file is released under the GPLv2
- */
-#ifndef _ASM_XTENSA_DEVICE_H
-#define _ASM_XTENSA_DEVICE_H
-
-struct dev_archdata {
-};
-
-struct pdev_archdata {
-};
-
-#endif /* _ASM_XTENSA_DEVICE_H */
diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h
index b39531babec0..eaaf1ebcc7a4 100644
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -44,18 +44,10 @@
 	: "r" (uaddr), "I" (-EFAULT), "r" (oparg)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 #if !XCHAL_HAVE_S32C1I
 	return -ENOSYS;
@@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (ret)
-		return ret;
+	if (!ret)
+		*oval = oldval;
 
-	switch (cmp) {
-	case FUTEX_OP_CMP_EQ: return (oldval == cmparg);
-	case FUTEX_OP_CMP_NE: return (oldval != cmparg);
-	case FUTEX_OP_CMP_LT: return (oldval < cmparg);
-	case FUTEX_OP_CMP_GE: return (oldval >= cmparg);
-	case FUTEX_OP_CMP_LE: return (oldval <= cmparg);
-	case FUTEX_OP_CMP_GT: return (oldval > cmparg);
-	}
-
-	return -ENOSYS;
+	return ret;
 }
 
 static inline int
diff --git a/arch/xtensa/include/asm/param.h b/arch/xtensa/include/asm/param.h
deleted file mode 100644
index 0a70e780ef2a..000000000000
--- a/arch/xtensa/include/asm/param.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * include/asm-xtensa/param.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-#ifndef _XTENSA_PARAM_H
-#define _XTENSA_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# define HZ		CONFIG_HZ	/* internal timer frequency */
-# define USER_HZ	100		/* for user interfaces in "ticks" */
-# define CLOCKS_PER_SEC (USER_HZ)	/* frequnzy at which times() counts */
-#endif /* _XTENSA_PARAM_H */
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index a36221cf6363..3bb49681ee24 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -33,11 +33,6 @@
 
 #define arch_spin_is_locked(x) ((x)->slock != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/xtensa/include/uapi/asm/ioctls.h b/arch/xtensa/include/uapi/asm/ioctls.h
index 98b004e24e85..47d82c09be7b 100644
--- a/arch/xtensa/include/uapi/asm/ioctls.h
+++ b/arch/xtensa/include/uapi/asm/ioctls.h
@@ -105,7 +105,7 @@
 #define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
 #define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
 #define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
-#define TIOCGPTPEER	_IOR('T', 0x41, int) /* Safely open the slave */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
 
 #define TIOCSERCONFIG	_IO('T', 83)
 #define TIOCSERGWILD	_IOR('T', 84,  int)
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 33bfa5270d95..08175df7a69e 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start)
  * Initialize system. Setup memory and reserve regions.
  */
 
-extern char _end;
-extern char _stext;
+extern char _end[];
+extern char _stext[];
 extern char _WindowVectors_text_start;
 extern char _WindowVectors_text_end;
 extern char _DebugInterruptVector_literal_start;
@@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 
-	mem_reserve(__pa(&_stext), __pa(&_end));
+	mem_reserve(__pa(_stext), __pa(_end));
 
 #ifdef CONFIG_VECTORS_OFFSET
 	mem_reserve(__pa(&_WindowVectors_text_start),
diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c
index d159e9b9c018..672391003e40 100644
--- a/arch/xtensa/kernel/xtensa_ksyms.c
+++ b/arch/xtensa/kernel/xtensa_ksyms.c
@@ -94,13 +94,11 @@ unsigned long __sync_fetch_and_or_4(unsigned long *p, unsigned long v)
 }
 EXPORT_SYMBOL(__sync_fetch_and_or_4);
 
-#ifdef CONFIG_NET
 /*
  * Networking support
  */
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(csum_partial_copy_generic);
-#endif /* CONFIG_NET */
 
 /*
  * Architecture-specific symbols
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 1a804a2f9a5b..3c75c4e597da 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -103,6 +103,7 @@ void clear_user_highpage(struct page *page, unsigned long vaddr)
 	clear_page_alias(kvaddr, paddr);
 	preempt_enable();
 }
+EXPORT_SYMBOL(clear_user_highpage);
 
 void copy_user_highpage(struct page *dst, struct page *src,
 			unsigned long vaddr, struct vm_area_struct *vma)
@@ -119,10 +120,7 @@ void copy_user_highpage(struct page *dst, struct page *src,
 	copy_page_alias(dst_vaddr, src_vaddr, dst_paddr, src_paddr);
 	preempt_enable();
 }
-
-#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */
-
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+EXPORT_SYMBOL(copy_user_highpage);
 
 /*
  * Any time the kernel writes to a user page cache page, or it is about to
@@ -176,7 +174,7 @@ void flush_dcache_page(struct page *page)
 
 	/* There shouldn't be an entry in the cache for this page anymore. */
 }
-
+EXPORT_SYMBOL(flush_dcache_page);
 
 /*
  * For now, flush the whole cache. FIXME??
@@ -188,6 +186,7 @@ void local_flush_cache_range(struct vm_area_struct *vma,
 	__flush_invalidate_dcache_all();
 	__invalidate_icache_all();
 }
+EXPORT_SYMBOL(local_flush_cache_range);
 
 /* 
  * Remove any entry in the cache for this page. 
@@ -207,8 +206,9 @@ void local_flush_cache_page(struct vm_area_struct *vma, unsigned long address,
 	__flush_invalidate_dcache_page_alias(virt, phys);
 	__invalidate_icache_page_alias(virt, phys);
 }
+EXPORT_SYMBOL(local_flush_cache_page);
 
-#endif
+#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */
 
 void
 update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
@@ -225,7 +225,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
 
 	flush_tlb_page(vma, addr);
 
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+#if (DCACHE_WAY_SIZE > PAGE_SIZE)
 
 	if (!PageReserved(page) && test_bit(PG_arch_1, &page->flags)) {
 		unsigned long phys = page_to_phys(page);
@@ -256,7 +256,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep)
  * flush_dcache_page() on the page.
  */
 
-#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK
+#if (DCACHE_WAY_SIZE > PAGE_SIZE)
 
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
 		unsigned long vaddr, void *dst, const void *src,