97 files changed, 1505 insertions, 874 deletions
diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h
index 4cb4b6d3452c..0bc66e1d3a7e 100644
--- a/arch/alpha/include/asm/types.h
+++ b/arch/alpha/include/asm/types.h
@@ -1,6 +1,6 @@
 #ifndef _ALPHA_TYPES_H
 #define _ALPHA_TYPES_H
 
-#include <asm-generic/int-ll64.h>
+#include <uapi/asm/types.h>
 
 #endif /* _ALPHA_TYPES_H */
diff --git a/arch/alpha/include/uapi/asm/types.h b/arch/alpha/include/uapi/asm/types.h
index 9fd3cd459777..8d1024d7be05 100644
--- a/arch/alpha/include/uapi/asm/types.h
+++ b/arch/alpha/include/uapi/asm/types.h
@@ -9,8 +9,18 @@
  * need to be careful to avoid a name clashes.
  */
 
-#ifndef __KERNEL__
+/*
+ * This is here because we used to use l64 for alpha
+ * and we don't want to impact user mode with our change to ll64
+ * in the kernel.
+ *
+ * However, some user programs are fine with this.  They can
+ * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here.
+ */
+#if !defined(__SANE_USERSPACE_TYPES__) && !defined(__KERNEL__)
 #include <asm-generic/int-l64.h>
+#else
+#include <asm-generic/int-ll64.h>
 #endif
 
 #endif /* _UAPI_ALPHA_TYPES_H */
diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S
index 1eea99beecc3..85d9ea4a0acc 100644
--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@@ -92,6 +92,12 @@ ENTRY(EV_MachineCheck)
 	lr  r0, [efa]
 	mov r1, sp
 
+	; hardware auto-disables MMU, re-enable it to allow kernel vaddr
+	; access for say stack unwinding of modules for crash dumps
+	lr	r3, [ARC_REG_PID]
+	or	r3, r3, MMU_ENABLE
+	sr	r3, [ARC_REG_PID]
+
 	lsr  	r3, r2, 8
 	bmsk 	r3, r3, 7
 	brne    r3, ECR_C_MCHK_DUP_TLB, 1f
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index bdb295e09160..a4dc881da277 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -896,9 +896,6 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address,
 
 	local_irq_save(flags);
 
-	/* re-enable the MMU */
-	write_aux_reg(ARC_REG_PID, MMU_ENABLE | read_aux_reg(ARC_REG_PID));
-
 	/* loop thru all sets of TLB */
 	for (set = 0; set < mmu->sets; set++) {
 
diff --git a/arch/arm/Kconfig-nommu b/arch/arm/Kconfig-nommu
index aed66d5df7f1..b7576349528c 100644
--- a/arch/arm/Kconfig-nommu
+++ b/arch/arm/Kconfig-nommu
@@ -34,8 +34,7 @@ config PROCESSOR_ID
 	  used instead of the auto-probing which utilizes the register.
 
 config REMAP_VECTORS_TO_RAM
-	bool 'Install vectors to the beginning of RAM' if DRAM_BASE
-	depends on DRAM_BASE
+	bool 'Install vectors to the beginning of RAM'
 	help
 	  The kernel needs to change the hardware exception vectors.
 	  In nommu mode, the hardware exception vectors are normally
diff --git a/arch/arm/boot/dts/am335x-chilisom.dtsi b/arch/arm/boot/dts/am335x-chilisom.dtsi
index f9ee5859c154..1b43ebd08b38 100644
--- a/arch/arm/boot/dts/am335x-chilisom.dtsi
+++ b/arch/arm/boot/dts/am335x-chilisom.dtsi
@@ -124,6 +124,14 @@
 
 &rtc {
 	system-power-controller;
+
+	pinctrl-0 = <&ext_wakeup>;
+	pinctrl-names = "default";
+
+	ext_wakeup: ext-wakeup {
+		pins = "ext_wakeup0";
+		input-enable;
+	};
 };
 
 /* NAND Flash */
diff --git a/arch/arm/boot/dts/armada-375.dtsi b/arch/arm/boot/dts/armada-375.dtsi
index cc952cf8ec30..024f1b75b0a3 100644
--- a/arch/arm/boot/dts/armada-375.dtsi
+++ b/arch/arm/boot/dts/armada-375.dtsi
@@ -176,9 +176,9 @@
 				reg = <0x8000 0x1000>;
 				cache-unified;
 				cache-level = <2>;
-				arm,double-linefill-incr = <1>;
+				arm,double-linefill-incr = <0>;
 				arm,double-linefill-wrap = <0>;
-				arm,double-linefill = <1>;
+				arm,double-linefill = <0>;
 				prefetch-data = <1>;
 			};
 
diff --git a/arch/arm/boot/dts/armada-38x.dtsi b/arch/arm/boot/dts/armada-38x.dtsi
index 2d7668848c5a..c60cfe9fd033 100644
--- a/arch/arm/boot/dts/armada-38x.dtsi
+++ b/arch/arm/boot/dts/armada-38x.dtsi
@@ -143,9 +143,9 @@
 				reg = <0x8000 0x1000>;
 				cache-unified;
 				cache-level = <2>;
-				arm,double-linefill-incr = <1>;
+				arm,double-linefill-incr = <0>;
 				arm,double-linefill-wrap = <0>;
-				arm,double-linefill = <1>;
+				arm,double-linefill = <0>;
 				prefetch-data = <1>;
 			};
 
diff --git a/arch/arm/boot/dts/armada-39x.dtsi b/arch/arm/boot/dts/armada-39x.dtsi
index 34cba87f9200..aeecfa7e5ea3 100644
--- a/arch/arm/boot/dts/armada-39x.dtsi
+++ b/arch/arm/boot/dts/armada-39x.dtsi
@@ -111,9 +111,9 @@
 				reg = <0x8000 0x1000>;
 				cache-unified;
 				cache-level = <2>;
-				arm,double-linefill-incr = <1>;
+				arm,double-linefill-incr = <0>;
 				arm,double-linefill-wrap = <0>;
-				arm,double-linefill = <1>;
+				arm,double-linefill = <0>;
 				prefetch-data = <1>;
 			};
 
diff --git a/arch/arm/boot/dts/bcm953012k.dts b/arch/arm/boot/dts/bcm953012k.dts
index 05a985a20378..6208e85acd9d 100644
--- a/arch/arm/boot/dts/bcm953012k.dts
+++ b/arch/arm/boot/dts/bcm953012k.dts
@@ -48,7 +48,7 @@
 	};
 
 	memory {
-		reg = <0x00000000 0x10000000>;
+		reg = <0x80000000 0x10000000>;
 	};
 };
 
diff --git a/arch/arm/boot/dts/exynos4412-odroid-common.dtsi b/arch/arm/boot/dts/exynos4412-odroid-common.dtsi
index 8aa19ba14436..5282d69e55bd 100644
--- a/arch/arm/boot/dts/exynos4412-odroid-common.dtsi
+++ b/arch/arm/boot/dts/exynos4412-odroid-common.dtsi
@@ -97,11 +97,11 @@
 	thermal-zones {
 		cpu_thermal: cpu-thermal {
 			cooling-maps {
-				map0 {
+				cooling_map0: map0 {
 				     /* Corresponds to 800MHz at freq_table */
 				     cooling-device = <&cpu0 7 7>;
 				};
-				map1 {
+				cooling_map1: map1 {
 				     /* Corresponds to 200MHz at freq_table */
 				     cooling-device = <&cpu0 13 13>;
 			       };
diff --git a/arch/arm/boot/dts/exynos4412-odroidu3.dts b/arch/arm/boot/dts/exynos4412-odroidu3.dts
index 99634c54dca9..7504a5aa538e 100644
--- a/arch/arm/boot/dts/exynos4412-odroidu3.dts
+++ b/arch/arm/boot/dts/exynos4412-odroidu3.dts
@@ -13,6 +13,7 @@
 
 /dts-v1/;
 #include "exynos4412-odroid-common.dtsi"
+#include "exynos4412-prime.dtsi"
 
 / {
 	model = "Hardkernel ODROID-U3 board based on Exynos4412";
@@ -47,11 +48,11 @@
 			cooling-maps {
 				map0 {
 				     trip = <&cpu_alert1>;
-				     cooling-device = <&cpu0 7 7>;
+				     cooling-device = <&cpu0 9 9>;
 				};
 				map1 {
 				     trip = <&cpu_alert2>;
-				     cooling-device = <&cpu0 13 13>;
+				     cooling-device = <&cpu0 15 15>;
 				};
 				map2 {
 				     trip = <&cpu_alert0>;
diff --git a/arch/arm/boot/dts/exynos4412-odroidx2.dts b/arch/arm/boot/dts/exynos4412-odroidx2.dts
index 4d228858f172..d6e92ebc3874 100644
--- a/arch/arm/boot/dts/exynos4412-odroidx2.dts
+++ b/arch/arm/boot/dts/exynos4412-odroidx2.dts
@@ -12,6 +12,7 @@
 */
 
 #include "exynos4412-odroidx.dts"
+#include "exynos4412-prime.dtsi"
 
 / {
 	model = "Hardkernel ODROID-X2 board based on Exynos4412";
diff --git a/arch/arm/boot/dts/exynos4412-prime.dtsi b/arch/arm/boot/dts/exynos4412-prime.dtsi
new file mode 100644
index 000000000000..e75bc170c89c
--- /dev/null
+++ b/arch/arm/boot/dts/exynos4412-prime.dtsi
@@ -0,0 +1,41 @@
+/*
+ * Samsung's Exynos4412 Prime SoC device tree source
+ *
+ * Copyright (c) 2016 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Exynos4412 Prime SoC revision supports higher CPU frequencies than
+ * non-Prime version.  Therefore we need to update OPPs table and
+ * thermal maps accordingly.
+ */
+
+&cpu0_opp_1500 {
+	/delete-property/turbo-mode;
+};
+
+&cpu0_opp_table {
+	opp@1600000000 {
+		opp-hz = /bits/ 64 <1600000000>;
+		opp-microvolt = <1350000>;
+		clock-latency-ns = <200000>;
+	};
+	opp@1704000000 {
+		opp-hz = /bits/ 64 <1704000000>;
+		opp-microvolt = <1350000>;
+		clock-latency-ns = <200000>;
+	};
+};
+
+&cooling_map0 {
+	cooling-device = <&cpu0 9 9>;
+};
+
+&cooling_map1 {
+	cooling-device = <&cpu0 15 15>;
+};
diff --git a/arch/arm/boot/dts/exynos4412.dtsi b/arch/arm/boot/dts/exynos4412.dtsi
index 40beede46e55..3ebdf01d814c 100644
--- a/arch/arm/boot/dts/exynos4412.dtsi
+++ b/arch/arm/boot/dts/exynos4412.dtsi
@@ -130,7 +130,7 @@
 			opp-microvolt = <1287500>;
 			clock-latency-ns = <200000>;
 		};
-		opp@1500000000 {
+		cpu0_opp_1500: opp@1500000000 {
 			opp-hz = /bits/ 64 <1500000000>;
 			opp-microvolt = <1350000>;
 			clock-latency-ns = <200000>;
diff --git a/arch/arm/boot/dts/mt2701.dtsi b/arch/arm/boot/dts/mt2701.dtsi
index 18596a2c58a1..77c6b931dc24 100644
--- a/arch/arm/boot/dts/mt2701.dtsi
+++ b/arch/arm/boot/dts/mt2701.dtsi
@@ -174,4 +174,40 @@
 		clocks = <&uart_clk>;
 		status = "disabled";
 	};
+
+	mmsys: syscon@14000000 {
+		compatible = "mediatek,mt2701-mmsys", "syscon";
+		reg = <0 0x14000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
+
+	imgsys: syscon@15000000 {
+		compatible = "mediatek,mt2701-imgsys", "syscon";
+		reg = <0 0x15000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
+
+	vdecsys: syscon@16000000 {
+		compatible = "mediatek,mt2701-vdecsys", "syscon";
+		reg = <0 0x16000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
+
+	hifsys: syscon@1a000000 {
+		compatible = "mediatek,mt2701-hifsys", "syscon";
+		reg = <0 0x1a000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
+
+	ethsys: syscon@1b000000 {
+		compatible = "mediatek,mt2701-ethsys", "syscon";
+		reg = <0 0x1b000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
+
+	bdpsys: syscon@1c000000 {
+		compatible = "mediatek,mt2701-bdpsys", "syscon";
+		reg = <0 0x1c000000 0 0x1000>;
+		#clock-cells = <1>;
+	};
 };
diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi
index 351fcc2f87df..b6c6410ca384 100644
--- a/arch/arm/boot/dts/r8a7790.dtsi
+++ b/arch/arm/boot/dts/r8a7790.dtsi
@@ -1493,7 +1493,8 @@
 	};
 
 	msiof0: spi@e6e20000 {
-		compatible = "renesas,msiof-r8a7790";
+		compatible = "renesas,msiof-r8a7790",
+			     "renesas,rcar-gen2-msiof";
 		reg = <0 0xe6e20000 0 0x0064>;
 		interrupts = <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&mstp0_clks R8A7790_CLK_MSIOF0>;
@@ -1507,7 +1508,8 @@
 	};
 
 	msiof1: spi@e6e10000 {
-		compatible = "renesas,msiof-r8a7790";
+		compatible = "renesas,msiof-r8a7790",
+			     "renesas,rcar-gen2-msiof";
 		reg = <0 0xe6e10000 0 0x0064>;
 		interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&mstp2_clks R8A7790_CLK_MSIOF1>;
@@ -1521,7 +1523,8 @@
 	};
 
 	msiof2: spi@e6e00000 {
-		compatible = "renesas,msiof-r8a7790";
+		compatible = "renesas,msiof-r8a7790",
+			     "renesas,rcar-gen2-msiof";
 		reg = <0 0xe6e00000 0 0x0064>;
 		interrupts = <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&mstp2_clks R8A7790_CLK_MSIOF2>;
@@ -1535,7 +1538,8 @@
 	};
 
 	msiof3: spi@e6c90000 {
-		compatible = "renesas,msiof-r8a7790";
+		compatible = "renesas,msiof-r8a7790",
+			     "renesas,rcar-gen2-msiof";
 		reg = <0 0xe6c90000 0 0x0064>;
 		interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&mstp2_clks R8A7790_CLK_MSIOF3>;
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 55e0e3ea9cb6..bd12b98e2589 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -37,4 +37,3 @@ generic-y += termbits.h
 generic-y += termios.h
 generic-y += timex.h
 generic-y += trace_clock.h
-generic-y += unaligned.h
diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h
new file mode 100644
index 000000000000..ab905ffcf193
--- /dev/null
+++ b/arch/arm/include/asm/unaligned.h
@@ -0,0 +1,27 @@
+#ifndef __ASM_ARM_UNALIGNED_H
+#define __ASM_ARM_UNALIGNED_H
+
+/*
+ * We generally want to set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS on ARMv6+,
+ * but we don't want to use linux/unaligned/access_ok.h since that can lead
+ * to traps on unaligned stm/ldm or strd/ldrd.
+ */
+#include <asm/byteorder.h>
+
+#if defined(__LITTLE_ENDIAN)
+# include <linux/unaligned/le_struct.h>
+# include <linux/unaligned/be_byteshift.h>
+# include <linux/unaligned/generic.h>
+# define get_unaligned	__get_unaligned_le
+# define put_unaligned	__put_unaligned_le
+#elif defined(__BIG_ENDIAN)
+# include <linux/unaligned/be_struct.h>
+# include <linux/unaligned/le_byteshift.h>
+# include <linux/unaligned/generic.h>
+# define get_unaligned	__get_unaligned_be
+# define put_unaligned	__put_unaligned_be
+#else
+# error need to define endianess
+#endif
+
+#endif /* __ASM_ARM_UNALIGNED_H */
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index 0064b86a2c87..30a13647c54c 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -227,7 +227,7 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
 	u32 return_offset = (is_thumb) ? 2 : 4;
 
 	kvm_update_psr(vcpu, UND_MODE);
-	*vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) - return_offset;
+	*vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
 
 	/* Branch to exception vector */
 	*vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
@@ -239,10 +239,8 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
  */
 static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
 {
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	bool is_thumb = (cpsr & PSR_T_BIT);
 	u32 vect_offset;
-	u32 return_offset = (is_thumb) ? 4 : 0;
+	u32 return_offset = (is_pabt) ? 4 : 8;
 	bool is_lpae;
 
 	kvm_update_psr(vcpu, ABT_MODE);
diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile
index 8679405b0b2b..92eab1d51785 100644
--- a/arch/arm/kvm/hyp/Makefile
+++ b/arch/arm/kvm/hyp/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module, HYP part
 #
 
-ccflags-y += -fno-stack-protector
+ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 KVM=../../../../virt/kvm
 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 0c060c5e844a..2206e0e00934 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -837,7 +837,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 	spin_lock(&kvm->mmu_lock);
 	if (kvm->arch.pgd) {
 		unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-		pgd = kvm->arch.pgd;
+		pgd = READ_ONCE(kvm->arch.pgd);
 		kvm->arch.pgd = NULL;
 	}
 	spin_unlock(&kvm->mmu_lock);
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 31dde8b6f2ea..8ba0e2e5ad97 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -335,7 +335,7 @@ static void at91sam9_sdram_standby(void)
 		at91_ramc_write(1, AT91_SDRAMC_LPR, saved_lpr1);
 }
 
-static const struct of_device_id const ramc_ids[] __initconst = {
+static const struct of_device_id ramc_ids[] __initconst = {
 	{ .compatible = "atmel,at91rm9200-sdramc", .data = at91rm9200_standby },
 	{ .compatible = "atmel,at91sam9260-sdramc", .data = at91sam9_sdram_standby },
 	{ .compatible = "atmel,at91sam9g45-ddramc", .data = at91_ddr_standby },
diff --git a/arch/arm/mach-bcm/bcm_kona_smc.c b/arch/arm/mach-bcm/bcm_kona_smc.c
index cf3f8658f0e5..a55a7ecf146a 100644
--- a/arch/arm/mach-bcm/bcm_kona_smc.c
+++ b/arch/arm/mach-bcm/bcm_kona_smc.c
@@ -33,7 +33,7 @@ struct bcm_kona_smc_data {
 	unsigned result;
 };
 
-static const struct of_device_id const bcm_kona_smc_ids[] __initconst = {
+static const struct of_device_id bcm_kona_smc_ids[] __initconst = {
 	{.compatible = "brcm,kona-smc"},
 	{.compatible = "bcm,kona-smc"}, /* deprecated name */
 	{},
diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c
index 03da3813f1ab..7d5a44a06648 100644
--- a/arch/arm/mach-cns3xxx/core.c
+++ b/arch/arm/mach-cns3xxx/core.c
@@ -346,7 +346,7 @@ static struct usb_ohci_pdata cns3xxx_usb_ohci_pdata = {
 	.power_off	= csn3xxx_usb_power_off,
 };
 
-static const struct of_dev_auxdata const cns3xxx_auxdata[] __initconst = {
+static const struct of_dev_auxdata cns3xxx_auxdata[] __initconst = {
 	{ "intel,usb-ehci", CNS3XXX_USB_BASE, "ehci-platform", &cns3xxx_usb_ehci_pdata },
 	{ "intel,usb-ohci", CNS3XXX_USB_OHCI_BASE, "ohci-platform", &cns3xxx_usb_ohci_pdata },
 	{ "cavium,cns3420-ahci", CNS3XXX_SATA2_BASE, "ahci", NULL },
diff --git a/arch/arm/mach-omap2/prm_common.c b/arch/arm/mach-omap2/prm_common.c
index 5b2f5138d938..f1ca9479491b 100644
--- a/arch/arm/mach-omap2/prm_common.c
+++ b/arch/arm/mach-omap2/prm_common.c
@@ -713,7 +713,7 @@ static struct omap_prcm_init_data scrm_data __initdata = {
 };
 #endif
 
-static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = {
+static const struct of_device_id omap_prcm_dt_match_table[] __initconst = {
 #ifdef CONFIG_SOC_AM33XX
 	{ .compatible = "ti,am3-prcm", .data = &am3_prm_data },
 #endif
diff --git a/arch/arm/mach-omap2/vc.c b/arch/arm/mach-omap2/vc.c
index 2028167fff31..d76b1e5eb8ba 100644
--- a/arch/arm/mach-omap2/vc.c
+++ b/arch/arm/mach-omap2/vc.c
@@ -559,7 +559,7 @@ struct i2c_init_data {
 	u8 hsscll_12;
 };
 
-static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = {
+static const struct i2c_init_data omap4_i2c_timing_data[] __initconst = {
 	{
 		.load = 50,
 		.loadbits = 0x3,
diff --git a/arch/arm/mach-spear/time.c b/arch/arm/mach-spear/time.c
index 9ccffc1d0f28..aaaa6781b9fe 100644
--- a/arch/arm/mach-spear/time.c
+++ b/arch/arm/mach-spear/time.c
@@ -204,7 +204,7 @@ static void __init spear_clockevent_init(int irq)
 	setup_irq(irq, &spear_timer_irq);
 }
 
-static const struct of_device_id const timer_of_match[] __initconst = {
+static const struct of_device_id timer_of_match[] __initconst = {
 	{ .compatible = "st,spear-timer", },
 	{ },
 };
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 926b1be48043..ce47dfe25fb0 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -314,8 +314,11 @@ retry:
 	 * signal first. We do not need to release the mmap_sem because
 	 * it would already be released in __lock_page_or_retry in
 	 * mm/filemap.c. */
-	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+		if (!user_mode(regs))
+			goto no_context;
 		return 0;
+	}
 
 	/*
 	 * Major/minor page fault accounting is only done on the
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index d062f08f5020..4b24964a520a 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -199,6 +199,7 @@ static struct dma_map_ops xen_swiotlb_dma_ops = {
 	.unmap_page = xen_swiotlb_unmap_page,
 	.dma_supported = xen_swiotlb_dma_supported,
 	.set_dma_mask = xen_swiotlb_set_dma_mask,
+	.mmap = xen_swiotlb_dma_mmap,
 };
 
 int __init xen_mm_init(void)
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index 49a5d8ccae27..68e6f88bdcfe 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -170,6 +170,7 @@
 				interrupt-controller;
 				reg = <0x1d00000 0x10000>, /* GICD */
 				      <0x1d40000 0x40000>; /* GICR */
+				interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>;
 			};
 		};
 
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 7ee6d74101cf..c18658690993 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -487,6 +487,7 @@ ENTRY(kimage_vaddr)
  * booted in EL1 or EL2 respectively.
  */
 ENTRY(el2_setup)
+	msr	SPsel, #1			// We want to use SP_EL{1,2}
 	mrs	x0, CurrentEL
 	cmp	x0, #CurrentEL_EL2
 	b.ne	1f
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 7501b4097c7a..f6c003e9edbf 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -113,7 +113,7 @@ static void __dump_instr(const char *lvl, struct pt_regs *regs)
 	for (i = -4; i < 1; i++) {
 		unsigned int val, bad;
 
-		bad = __get_user(val, &((u32 *)addr)[i]);
+		bad = get_user(val, &((u32 *)addr)[i]);
 
 		if (!bad)
 			p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val);
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 14c4e3b14bcb..48b03547a969 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module, HYP part
 #
 
-ccflags-y += -fno-stack-protector
+ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 KVM=../../../../virt/kvm
 
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index da6a8cfa54a0..3556715a774e 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -33,12 +33,26 @@
 #define LOWER_EL_AArch64_VECTOR		0x400
 #define LOWER_EL_AArch32_VECTOR		0x600
 
+/*
+ * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
+ */
+static const u8 return_offsets[8][2] = {
+	[0] = { 0, 0 },		/* Reset, unused */
+	[1] = { 4, 2 },		/* Undefined */
+	[2] = { 0, 0 },		/* SVC, unused */
+	[3] = { 4, 4 },		/* Prefetch abort */
+	[4] = { 8, 8 },		/* Data abort */
+	[5] = { 0, 0 },		/* HVC, unused */
+	[6] = { 4, 4 },		/* IRQ, unused */
+	[7] = { 4, 4 },		/* FIQ, unused */
+};
+
 static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
 {
 	unsigned long cpsr;
 	unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
 	bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT);
-	u32 return_offset = (is_thumb) ? 4 : 0;
+	u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
 	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
 
 	cpsr = mode | COMPAT_PSR_I_BIT;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 523b9d3c8143..cf4e3f53a1e3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -515,7 +515,7 @@ static const struct fault_info fault_info[] = {
 	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault"	},
 	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	},
 	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	},
-	{ do_page_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
+	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
 	{ do_bad,		SIGBUS,  0,		"unknown 8"			},
 	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault"	},
 	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
diff --git a/arch/mips/ath79/clock.c b/arch/mips/ath79/clock.c
index cc3a1e33a600..7e2bb12b64ea 100644
--- a/arch/mips/ath79/clock.c
+++ b/arch/mips/ath79/clock.c
@@ -508,16 +508,19 @@ static void __init ath79_clocks_init_dt_ng(struct device_node *np)
 		ar9330_clk_init(ref_clk, pll_base);
 	else {
 		pr_err("%s: could not find any appropriate clk_init()\n", dnfn);
-		goto err_clk;
+		goto err_iounmap;
 	}
 
 	if (of_clk_add_provider(np, of_clk_src_onecell_get, &clk_data)) {
 		pr_err("%s: could not register clk provider\n", dnfn);
-		goto err_clk;
+		goto err_iounmap;
 	}
 
 	return;
 
+err_iounmap:
+	iounmap(pll_base);
+
 err_clk:
 	clk_put(ref_clk);
 
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 956db6e201d1..c5d351786416 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -18,9 +18,24 @@
 #include <irq.h>
 
 #define IRQ_STACK_SIZE			THREAD_SIZE
+#define IRQ_STACK_START			(IRQ_STACK_SIZE - 16)
 
 extern void *irq_stack[NR_CPUS];
 
+/*
+ * The highest address on the IRQ stack contains a dummy frame put down in
+ * genex.S (handle_int & except_vec_vi_handler) which is structured as follows:
+ *
+ *   top ------------
+ *       | task sp  | <- irq_stack[cpu] + IRQ_STACK_START
+ *       ------------
+ *       |          | <- First frame of IRQ context
+ *       ------------
+ *
+ * task sp holds a copy of the task stack pointer where the struct pt_regs
+ * from exception entry can be found.
+ */
+
 static inline bool on_irq_stack(int cpu, unsigned long sp)
 {
 	unsigned long low = (unsigned long)irq_stack[cpu];
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 4be2763f835d..bfff6ea45d51 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -103,6 +103,7 @@ void output_thread_info_defines(void)
 	DEFINE(_THREAD_SIZE, THREAD_SIZE);
 	DEFINE(_THREAD_MASK, THREAD_MASK);
 	DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE);
+	DEFINE(_IRQ_STACK_START, IRQ_STACK_START);
 	BLANK();
 }
 
diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S
index 59476a607add..a00e87b0256d 100644
--- a/arch/mips/kernel/cps-vec.S
+++ b/arch/mips/kernel/cps-vec.S
@@ -361,7 +361,7 @@ LEAF(mips_cps_get_bootcfg)
 	END(mips_cps_get_bootcfg)
 
 LEAF(mips_cps_boot_vpes)
-	PTR_L	ta2, COREBOOTCFG_VPEMASK(a0)
+	lw	ta2, COREBOOTCFG_VPEMASK(a0)
 	PTR_L	ta3, COREBOOTCFG_VPECONFIG(a0)
 
 #if defined(CONFIG_CPU_MIPSR6)
diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S
index 2ac6c2625c13..ae810da4d499 100644
--- a/arch/mips/kernel/genex.S
+++ b/arch/mips/kernel/genex.S
@@ -215,9 +215,11 @@ NESTED(handle_int, PT_SIZE, sp)
 	beq	t0, t1, 2f
 
 	/* Switch to IRQ stack */
-	li	t1, _IRQ_STACK_SIZE
+	li	t1, _IRQ_STACK_START
 	PTR_ADD sp, t0, t1
 
+	/* Save task's sp on IRQ stack so that unwinding can follow it */
+	LONG_S	s1, 0(sp)
 2:
 	jal	plat_irq_dispatch
 
@@ -325,9 +327,11 @@ NESTED(except_vec_vi_handler, 0, sp)
 	beq	t0, t1, 2f
 
 	/* Switch to IRQ stack */
-	li	t1, _IRQ_STACK_SIZE
+	li	t1, _IRQ_STACK_START
 	PTR_ADD sp, t0, t1
 
+	/* Save task's sp on IRQ stack so that unwinding can follow it */
+	LONG_S	s1, 0(sp)
 2:
 	jalr	v0
 
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index fbbf5fcc695a..1b50958a1373 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -487,31 +487,52 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page,
 					      unsigned long pc,
 					      unsigned long *ra)
 {
+	unsigned long low, high, irq_stack_high;
 	struct mips_frame_info info;
 	unsigned long size, ofs;
+	struct pt_regs *regs;
 	int leaf;
-	extern void ret_from_irq(void);
-	extern void ret_from_exception(void);
 
 	if (!stack_page)
 		return 0;
 
 	/*
-	 * If we reached the bottom of interrupt context,
-	 * return saved pc in pt_regs.
+	 * IRQ stacks start at IRQ_STACK_START
+	 * task stacks at THREAD_SIZE - 32
 	 */
-	if (pc == (unsigned long)ret_from_irq ||
-	    pc == (unsigned long)ret_from_exception) {
-		struct pt_regs *regs;
-		if (*sp >= stack_page &&
-		    *sp + sizeof(*regs) <= stack_page + THREAD_SIZE - 32) {
-			regs = (struct pt_regs *)*sp;
-			pc = regs->cp0_epc;
-			if (!user_mode(regs) && __kernel_text_address(pc)) {
-				*sp = regs->regs[29];
-				*ra = regs->regs[31];
-				return pc;
-			}
+	low = stack_page;
+	if (!preemptible() && on_irq_stack(raw_smp_processor_id(), *sp)) {
+		high = stack_page + IRQ_STACK_START;
+		irq_stack_high = high;
+	} else {
+		high = stack_page + THREAD_SIZE - 32;
+		irq_stack_high = 0;
+	}
+
+	/*
+	 * If we reached the top of the interrupt stack, start unwinding
+	 * the interrupted task stack.
+	 */
+	if (unlikely(*sp == irq_stack_high)) {
+		unsigned long task_sp = *(unsigned long *)*sp;
+
+		/*
+		 * Check that the pointer saved in the IRQ stack head points to
+		 * something within the stack of the current task
+		 */
+		if (!object_is_on_stack((void *)task_sp))
+			return 0;
+
+		/*
+		 * Follow pointer to tasks kernel stack frame where interrupted
+		 * state was saved.
+		 */
+		regs = (struct pt_regs *)task_sp;
+		pc = regs->cp0_epc;
+		if (!user_mode(regs) && __kernel_text_address(pc)) {
+			*sp = regs->regs[29];
+			*ra = regs->regs[31];
+			return pc;
 		}
 		return 0;
 	}
@@ -532,8 +553,7 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page,
 	if (leaf < 0)
 		return 0;
 
-	if (*sp < stack_page ||
-	    *sp + info.frame_size > stack_page + THREAD_SIZE - 32)
+	if (*sp < low || *sp + info.frame_size > high)
 		return 0;
 
 	if (leaf)
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index d5de67591735..f0a0e6d62be3 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -182,7 +182,7 @@ SECTIONS
 	 * Force .bss to 64K alignment so that .bss..swapper_pg_dir
 	 * gets that alignment.	 .sbss should be empty, so there will be
 	 * no holes after __init_end. */
-	BSS_SECTION(0, 0x10000, 0)
+	BSS_SECTION(0, 0x10000, 8)
 
 	_end = . ;
 
diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c
index 90565477dfbd..95bec460b651 100644
--- a/arch/mips/lantiq/xway/sysctrl.c
+++ b/arch/mips/lantiq/xway/sysctrl.c
@@ -469,8 +469,8 @@ void __init ltq_soc_init(void)
 			panic("Failed to load xbar nodes from devicetree");
 		if (of_address_to_resource(np_xbar, 0, &res_xbar))
 			panic("Failed to get xbar resources");
-		if (request_mem_region(res_xbar.start, resource_size(&res_xbar),
-			res_xbar.name) < 0)
+		if (!request_mem_region(res_xbar.start, resource_size(&res_xbar),
+			res_xbar.name))
 			panic("Failed to get xbar resources");
 
 		ltq_xbar_membase = ioremap_nocache(res_xbar.start,
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index e9385bcd9723..9ade60ca08e0 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -2386,7 +2386,6 @@ dcopuop:
 					break;
 				default:
 					/* Reserved R6 ops */
-					pr_err("Reserved MIPS R6 CMP.condn.S operation\n");
 					return SIGILL;
 				}
 			}
@@ -2460,7 +2459,6 @@ dcopuop:
 					break;
 				default:
 					/* Reserved R6 ops */
-					pr_err("Reserved MIPS R6 CMP.condn.D operation\n");
 					return SIGILL;
 				}
 			}
diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c
index fd71b8daaaf2..5bec64f2884e 100644
--- a/arch/mips/math-emu/dp_fmax.c
+++ b/arch/mips/math-emu/dp_fmax.c
@@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754dp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -80,9 +92,7 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y)
 		return ys ? x : y;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754dp_zero(1);
+		return ieee754dp_zero(xs & ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
@@ -106,16 +116,32 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y)
 	else if (xs < ys)
 		return x;
 
-	/* Compare exponent */
-	if (xe > ye)
-		return x;
-	else if (xe < ye)
-		return y;
+	/* Signs of inputs are equal, let's compare exponents */
+	if (xs == 0) {
+		/* Inputs are both positive */
+		if (xe > ye)
+			return x;
+		else if (xe < ye)
+			return y;
+	} else {
+		/* Inputs are both negative */
+		if (xe > ye)
+			return y;
+		else if (xe < ye)
+			return x;
+	}
 
-	/* Compare mantissa */
+	/* Signs and exponents of inputs are equal, let's compare mantissas */
+	if (xs == 0) {
+		/* Inputs are both positive, with equal signs and exponents */
+		if (xm <= ym)
+			return y;
+		return x;
+	}
+	/* Inputs are both negative, with equal signs and exponents */
 	if (xm <= ym)
-		return y;
-	return x;
+		return x;
+	return y;
 }
 
 union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
@@ -147,14 +173,26 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754dp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -164,6 +202,9 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 	/*
 	 * Infinity and zero handling
 	 */
+	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
+		return ieee754dp_inf(xs & ys);
+
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
@@ -171,7 +212,6 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
@@ -180,9 +220,7 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 		return y;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754dp_zero(1);
+		return ieee754dp_zero(xs & ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
@@ -207,7 +245,11 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 		return y;
 
 	/* Compare mantissa */
-	if (xm <= ym)
+	if (xm < ym)
 		return y;
-	return x;
+	else if (xm > ym)
+		return x;
+	else if (xs == 0)
+		return x;
+	return y;
 }
diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c
index c1072b0dfb95..a287b23818d8 100644
--- a/arch/mips/math-emu/dp_fmin.c
+++ b/arch/mips/math-emu/dp_fmin.c
@@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754dp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -80,9 +92,7 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y)
 		return ys ? y : x;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754dp_zero(1);
+		return ieee754dp_zero(xs | ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
@@ -106,16 +116,32 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y)
 	else if (xs < ys)
 		return y;
 
-	/* Compare exponent */
-	if (xe > ye)
-		return y;
-	else if (xe < ye)
-		return x;
+	/* Signs of inputs are the same, let's compare exponents */
+	if (xs == 0) {
+		/* Inputs are both positive */
+		if (xe > ye)
+			return y;
+		else if (xe < ye)
+			return x;
+	} else {
+		/* Inputs are both negative */
+		if (xe > ye)
+			return x;
+		else if (xe < ye)
+			return y;
+	}
 
-	/* Compare mantissa */
+	/* Signs and exponents of inputs are equal, let's compare mantissas */
+	if (xs == 0) {
+		/* Inputs are both positive, with equal signs and exponents */
+		if (xm <= ym)
+			return x;
+		return y;
+	}
+	/* Inputs are both negative, with equal signs and exponents */
 	if (xm <= ym)
-		return x;
-	return y;
+		return y;
+	return x;
 }
 
 union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y)
@@ -147,14 +173,26 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754dp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -164,25 +202,25 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y)
 	/*
 	 * Infinity and zero handling
 	 */
+	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
+		return ieee754dp_inf(xs | ys);
+
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
-		return x;
+		return y;
 
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM):
-		return y;
+		return x;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754dp_zero(1);
+		return ieee754dp_zero(xs | ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
@@ -207,7 +245,11 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y)
 		return x;
 
 	/* Compare mantissa */
-	if (xm <= ym)
+	if (xm < ym)
+		return x;
+	else if (xm > ym)
+		return y;
+	else if (xs == 1)
 		return x;
 	return y;
 }
diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c
index 4a2d03c72959..e0d9be5fbf4c 100644
--- a/arch/mips/math-emu/dp_maddf.c
+++ b/arch/mips/math-emu/dp_maddf.c
@@ -14,22 +14,45 @@
 
 #include "ieee754dp.h"
 
-enum maddf_flags {
-	maddf_negate_product	= 1 << 0,
-};
+
+/* 128 bits shift right logical with rounding. */
+void srl128(u64 *hptr, u64 *lptr, int count)
+{
+	u64 low;
+
+	if (count >= 128) {
+		*lptr = *hptr != 0 || *lptr != 0;
+		*hptr = 0;
+	} else if (count >= 64) {
+		if (count == 64) {
+			*lptr = *hptr | (*lptr != 0);
+		} else {
+			low = *lptr;
+			*lptr = *hptr >> (count - 64);
+			*lptr |= (*hptr << (128 - count)) != 0 || low != 0;
+		}
+		*hptr = 0;
+	} else {
+		low = *lptr;
+		*lptr = low >> count | *hptr << (64 - count);
+		*lptr |= (low << (64 - count)) != 0;
+		*hptr = *hptr >> count;
+	}
+}
 
 static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 				 union ieee754dp y, enum maddf_flags flags)
 {
 	int re;
 	int rs;
-	u64 rm;
 	unsigned lxm;
 	unsigned hxm;
 	unsigned lym;
 	unsigned hym;
 	u64 lrm;
 	u64 hrm;
+	u64 lzm;
+	u64 hzm;
 	u64 t;
 	u64 at;
 	int s;
@@ -48,52 +71,34 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 
 	ieee754_clearcx();
 
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
+	/*
+	 * Handle the cases when at least one of x, y or z is a NaN.
+	 * Order of precedence is sNaN, qNaN and z, x, y.
+	 */
+	if (zc == IEEE754_CLASS_SNAN)
 		return ieee754dp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		DPDNORMZ;
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
-		return ieee754dp_nanxcpt(y);
-
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
+	if (xc == IEEE754_CLASS_SNAN)
 		return ieee754dp_nanxcpt(x);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
+	if (yc == IEEE754_CLASS_SNAN)
+		return ieee754dp_nanxcpt(y);
+	if (zc == IEEE754_CLASS_QNAN)
+		return z;
+	if (xc == IEEE754_CLASS_QNAN)
+		return x;
+	if (yc == IEEE754_CLASS_QNAN)
 		return y;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
+	if (zc == IEEE754_CLASS_DNORM)
+		DPDNORMZ;
+	/* ZERO z cases are handled separately below */
 
+	switch (CLPAIR(xc, yc)) {
 
 	/*
 	 * Infinity handling
 	 */
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754dp_indef();
 
@@ -102,9 +107,27 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754dp_inf(xs ^ ys);
+		if ((zc == IEEE754_CLASS_INF) &&
+		    ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) ||
+		     ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) {
+			/*
+			 * Cases of addition of infinities with opposite signs
+			 * or subtraction of infinities with same signs.
+			 */
+			ieee754_setcx(IEEE754_INVALID_OPERATION);
+			return ieee754dp_indef();
+		}
+		/*
+		 * z is here either not an infinity, or an infinity having the
+		 * same sign as product (x*y) (in case of MADDF.D instruction)
+		 * or product -(x*y) (in MSUBF.D case). The result must be an
+		 * infinity, and its sign is determined only by the value of
+		 * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y.
+		 */
+		if (flags & MADDF_NEGATE_PRODUCT)
+			return ieee754dp_inf(1 ^ (xs ^ ys));
+		else
+			return ieee754dp_inf(xs ^ ys);
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
@@ -113,32 +136,42 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
-		/* Multiplication is 0 so just return z */
+		if (zc == IEEE754_CLASS_ZERO) {
+			/* Handle cases +0 + (-0) and similar ones. */
+			if ((!(flags & MADDF_NEGATE_PRODUCT)
+					&& (zs == (xs ^ ys))) ||
+			    ((flags & MADDF_NEGATE_PRODUCT)
+					&& (zs != (xs ^ ys))))
+				/*
+				 * Cases of addition of zeros of equal signs
+				 * or subtraction of zeroes of opposite signs.
+				 * The sign of the resulting zero is in any
+				 * such case determined only by the sign of z.
+				 */
+				return z;
+
+			return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD);
+		}
+		/* x*y is here 0, and z is not 0, so just return z */
 		return z;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
 		DPDNORMY;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
 		DPDNORMX;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
 		/* fall through to real computations */
 	}
@@ -157,7 +190,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 
 	re = xe + ye;
 	rs = xs ^ ys;
-	if (flags & maddf_negate_product)
+	if (flags & MADDF_NEGATE_PRODUCT)
 		rs ^= 1;
 
 	/* shunt to top of word */
@@ -165,7 +198,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 	ym <<= 64 - (DP_FBITS + 1);
 
 	/*
-	 * Multiply 64 bits xm, ym to give high 64 bits rm with stickness.
+	 * Multiply 64 bits xm and ym to give 128 bits result in hrm:lrm.
 	 */
 
 	/* 32 * 32 => 64 */
@@ -195,78 +228,110 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
 
 	hrm = hrm + (t >> 32);
 
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((s64) rm < 0) {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3))) |
-		     ((rm << (DP_FBITS + 1 + 3)) != 0);
+	/* Put explicit bit at bit 126 if necessary */
+	if ((int64_t)hrm < 0) {
+		lrm = (hrm << 63) | (lrm >> 1);
+		hrm = hrm >> 1;
 		re++;
-	} else {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (DP_FBITS + 1 + 3 + 1)) != 0);
 	}
-	assert(rm & (DP_HIDDEN_BIT << 3));
 
-	/* And now the addition */
-	assert(zm & DP_HIDDEN_BIT);
+	assert(hrm & (1 << 62));
 
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
+	if (zc == IEEE754_CLASS_ZERO) {
+		/*
+		 * Move explicit bit from bit 126 to bit 55 since the
+		 * ieee754dp_format code expects the mantissa to be
+		 * 56 bits wide (53 + 3 rounding bits).
+		 */
+		srl128(&hrm, &lrm, (126 - 55));
+		return ieee754dp_format(rs, re, lrm);
+	}
+
+	/* Move explicit bit from bit 52 to bit 126 */
+	lzm = 0;
+	hzm = zm << 10;
+	assert(hzm & (1 << 62));
 
+	/* Make the exponents the same */
 	if (ze > re) {
 		/*
 		 * Have to shift y fraction right to align.
 		 */
 		s = ze - re;
-		rm = XDPSRS(rm, s);
+		srl128(&hrm, &lrm, s);
 		re += s;
 	} else if (re > ze) {
 		/*
 		 * Have to shift x fraction right to align.
 		 */
 		s = re - ze;
-		zm = XDPSRS(zm, s);
+		srl128(&hzm, &lzm, s);
 		ze += s;
 	}
 	assert(ze == re);
 	assert(ze <= DP_EMAX);
 
+	/* Do the addition */
 	if (zs == rs) {
 		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in xm, xs and xe.
+		 * Generate 128 bit result by adding two 127 bit numbers
+		 * leaving result in hzm:lzm, zs and ze.
 		 */
-		zm = zm + rm;
-
-		if (zm >> (DP_FBITS + 1 + 3)) { /* carry out */
-			zm = XDPSRS1(zm);
+		hzm = hzm + hrm + (lzm > (lzm + lrm));
+		lzm = lzm + lrm;
+		if ((int64_t)hzm < 0) {        /* carry out */
+			srl128(&hzm, &lzm, 1);
 			ze++;
 		}
 	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
+		if (hzm > hrm || (hzm == hrm && lzm >= lrm)) {
+			hzm = hzm - hrm - (lzm < lrm);
+			lzm = lzm - lrm;
 		} else {
-			zm = rm - zm;
+			hzm = hrm - hzm - (lrm < lzm);
+			lzm = lrm - lzm;
 			zs = rs;
 		}
-		if (zm == 0)
+		if (lzm == 0 && hzm == 0)
 			return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD);
 
 		/*
-		 * Normalize to rounding precision.
+		 * Put explicit bit at bit 126 if necessary.
 		 */
-		while ((zm >> (DP_FBITS + 3)) == 0) {
-			zm <<= 1;
-			ze--;
+		if (hzm == 0) {
+			/* left shift by 63 or 64 bits */
+			if ((int64_t)lzm < 0) {
+				/* MSB of lzm is the explicit bit */
+				hzm = lzm >> 1;
+				lzm = lzm << 63;
+				ze -= 63;
+			} else {
+				hzm = lzm;
+				lzm = 0;
+				ze -= 64;
+			}
+		}
+
+		t = 0;
+		while ((hzm >> (62 - t)) == 0)
+			t++;
+
+		assert(t <= 62);
+		if (t) {
+			hzm = hzm << t | lzm >> (64 - t);
+			lzm = lzm << t;
+			ze -= t;
 		}
 	}
 
-	return ieee754dp_format(zs, ze, zm);
+	/*
+	 * Move explicit bit from bit 126 to bit 55 since the
+	 * ieee754dp_format code expects the mantissa to be
+	 * 56 bits wide (53 + 3 rounding bits).
+	 */
+	srl128(&hzm, &lzm, (126 - 55));
+
+	return ieee754dp_format(zs, ze, lzm);
 }
 
 union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
@@ -278,5 +343,5 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
 union ieee754dp ieee754dp_msubf(union ieee754dp z, union ieee754dp x,
 				union ieee754dp y)
 {
-	return _dp_maddf(z, x, y, maddf_negate_product);
+	return _dp_maddf(z, x, y, MADDF_NEGATE_PRODUCT);
 }
diff --git a/arch/mips/math-emu/ieee754int.h b/arch/mips/math-emu/ieee754int.h
index 8bc2f6963324..dd2071f430e0 100644
--- a/arch/mips/math-emu/ieee754int.h
+++ b/arch/mips/math-emu/ieee754int.h
@@ -26,6 +26,10 @@
 
 #define CLPAIR(x, y)	((x)*6+(y))
 
+enum maddf_flags {
+	MADDF_NEGATE_PRODUCT	= 1 << 0,
+};
+
 static inline void ieee754_clearcx(void)
 {
 	ieee754_csr.cx = 0;
diff --git a/arch/mips/math-emu/ieee754sp.h b/arch/mips/math-emu/ieee754sp.h
index 8476067075fe..0f63e4202cff 100644
--- a/arch/mips/math-emu/ieee754sp.h
+++ b/arch/mips/math-emu/ieee754sp.h
@@ -45,6 +45,10 @@ static inline int ieee754sp_finite(union ieee754sp x)
 	return SPBEXP(x) != SP_EMAX + 1 + SP_EBIAS;
 }
 
+/* 64 bit right shift with rounding */
+#define XSPSRS64(v, rs)						\
+	(((rs) >= 64) ? ((v) != 0) : ((v) >> (rs)) | ((v) << (64-(rs)) != 0))
+
 /* 3bit extended single precision sticky right shift */
 #define XSPSRS(v, rs)						\
 	((rs > (SP_FBITS+3))?1:((v) >> (rs)) | ((v) << (32-(rs)) != 0))
diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c
index 4d000844e48e..74a5a00d2f22 100644
--- a/arch/mips/math-emu/sp_fmax.c
+++ b/arch/mips/math-emu/sp_fmax.c
@@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754sp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -80,9 +92,7 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y)
 		return ys ? x : y;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754sp_zero(1);
+		return ieee754sp_zero(xs & ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
@@ -106,16 +116,32 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y)
 	else if (xs < ys)
 		return x;
 
-	/* Compare exponent */
-	if (xe > ye)
-		return x;
-	else if (xe < ye)
-		return y;
+	/* Signs of inputs are equal, let's compare exponents */
+	if (xs == 0) {
+		/* Inputs are both positive */
+		if (xe > ye)
+			return x;
+		else if (xe < ye)
+			return y;
+	} else {
+		/* Inputs are both negative */
+		if (xe > ye)
+			return y;
+		else if (xe < ye)
+			return x;
+	}
 
-	/* Compare mantissa */
+	/* Signs and exponents of inputs are equal, let's compare mantissas */
+	if (xs == 0) {
+		/* Inputs are both positive, with equal signs and exponents */
+		if (xm <= ym)
+			return y;
+		return x;
+	}
+	/* Inputs are both negative, with equal signs and exponents */
 	if (xm <= ym)
-		return y;
-	return x;
+		return x;
+	return y;
 }
 
 union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
@@ -147,14 +173,26 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754sp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -164,6 +202,9 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 	/*
 	 * Infinity and zero handling
 	 */
+	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
+		return ieee754sp_inf(xs & ys);
+
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
@@ -171,7 +212,6 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
@@ -180,9 +220,7 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 		return y;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754sp_zero(1);
+		return ieee754sp_zero(xs & ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
@@ -207,7 +245,11 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 		return y;
 
 	/* Compare mantissa */
-	if (xm <= ym)
+	if (xm < ym)
 		return y;
-	return x;
+	else if (xm > ym)
+		return x;
+	else if (xs == 0)
+		return x;
+	return y;
 }
diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c
index 4eb1bb9e9dec..c51385f46b09 100644
--- a/arch/mips/math-emu/sp_fmin.c
+++ b/arch/mips/math-emu/sp_fmin.c
@@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754sp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -80,9 +92,7 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y)
 		return ys ? y : x;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754sp_zero(1);
+		return ieee754sp_zero(xs | ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
@@ -106,16 +116,32 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y)
 	else if (xs < ys)
 		return y;
 
-	/* Compare exponent */
-	if (xe > ye)
-		return y;
-	else if (xe < ye)
-		return x;
+	/* Signs of inputs are the same, let's compare exponents */
+	if (xs == 0) {
+		/* Inputs are both positive */
+		if (xe > ye)
+			return y;
+		else if (xe < ye)
+			return x;
+	} else {
+		/* Inputs are both negative */
+		if (xe > ye)
+			return x;
+		else if (xe < ye)
+			return y;
+	}
 
-	/* Compare mantissa */
+	/* Signs and exponents of inputs are equal, let's compare mantissas */
+	if (xs == 0) {
+		/* Inputs are both positive, with equal signs and exponents */
+		if (xm <= ym)
+			return x;
+		return y;
+	}
+	/* Inputs are both negative, with equal signs and exponents */
 	if (xm <= ym)
-		return x;
-	return y;
+		return y;
+	return x;
 }
 
 union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y)
@@ -147,14 +173,26 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y)
 	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
 		return ieee754sp_nanxcpt(x);
 
-	/* numbers are preferred to NaNs */
+	/*
+	 * Quiet NaN handling
+	 */
+
+	/*
+	 *    The case of both inputs quiet NaNs
+	 */
+	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
+		return x;
+
+	/*
+	 *    The cases of exactly one input quiet NaN (numbers
+	 *    are here preferred as returned values to NaNs)
+	 */
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
 		return x;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
@@ -164,25 +202,25 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y)
 	/*
 	 * Infinity and zero handling
 	 */
+	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
+		return ieee754sp_inf(xs | ys);
+
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
-		return x;
+		return y;
 
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM):
-		return y;
+		return x;
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-		if (xs == ys)
-			return x;
-		return ieee754sp_zero(1);
+		return ieee754sp_zero(xs | ys);
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
@@ -207,7 +245,11 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y)
 		return x;
 
 	/* Compare mantissa */
-	if (xm <= ym)
+	if (xm < ym)
+		return x;
+	else if (xm > ym)
+		return y;
+	else if (xs == 1)
 		return x;
 	return y;
 }
diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c
index a8cd8b4f235e..7195fe785d81 100644
--- a/arch/mips/math-emu/sp_maddf.c
+++ b/arch/mips/math-emu/sp_maddf.c
@@ -14,9 +14,6 @@
 
 #include "ieee754sp.h"
 
-enum maddf_flags {
-	maddf_negate_product	= 1 << 0,
-};
 
 static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 				 union ieee754sp y, enum maddf_flags flags)
@@ -24,14 +21,8 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 	int re;
 	int rs;
 	unsigned rm;
-	unsigned short lxm;
-	unsigned short hxm;
-	unsigned short lym;
-	unsigned short hym;
-	unsigned lrm;
-	unsigned hrm;
-	unsigned t;
-	unsigned at;
+	uint64_t rm64;
+	uint64_t zm64;
 	int s;
 
 	COMPXSP;
@@ -48,51 +39,35 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 
 	ieee754_clearcx();
 
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
+	/*
+	 * Handle the cases when at least one of x, y or z is a NaN.
+	 * Order of precedence is sNaN, qNaN and z, x, y.
+	 */
+	if (zc == IEEE754_CLASS_SNAN)
 		return ieee754sp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		SPDNORMZ;
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
+	if (xc == IEEE754_CLASS_SNAN)
+		return ieee754sp_nanxcpt(x);
+	if (yc == IEEE754_CLASS_SNAN)
 		return ieee754sp_nanxcpt(y);
+	if (zc == IEEE754_CLASS_QNAN)
+		return z;
+	if (xc == IEEE754_CLASS_QNAN)
+		return x;
+	if (yc == IEEE754_CLASS_QNAN)
+		return y;
 
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
-		return ieee754sp_nanxcpt(x);
+	if (zc == IEEE754_CLASS_DNORM)
+		SPDNORMZ;
+	/* ZERO z cases are handled separately below */
 
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
-		return y;
+	switch (CLPAIR(xc, yc)) {
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
 
 	/*
 	 * Infinity handling
 	 */
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754sp_indef();
 
@@ -101,9 +76,27 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754sp_inf(xs ^ ys);
+		if ((zc == IEEE754_CLASS_INF) &&
+		    ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) ||
+		     ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) {
+			/*
+			 * Cases of addition of infinities with opposite signs
+			 * or subtraction of infinities with same signs.
+			 */
+			ieee754_setcx(IEEE754_INVALID_OPERATION);
+			return ieee754sp_indef();
+		}
+		/*
+		 * z is here either not an infinity, or an infinity having the
+		 * same sign as product (x*y) (in case of MADDF.D instruction)
+		 * or product -(x*y) (in MSUBF.D case). The result must be an
+		 * infinity, and its sign is determined only by the value of
+		 * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y.
+		 */
+		if (flags & MADDF_NEGATE_PRODUCT)
+			return ieee754sp_inf(1 ^ (xs ^ ys));
+		else
+			return ieee754sp_inf(xs ^ ys);
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
@@ -112,32 +105,42 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
-		/* Multiplication is 0 so just return z */
+		if (zc == IEEE754_CLASS_ZERO) {
+			/* Handle cases +0 + (-0) and similar ones. */
+			if ((!(flags & MADDF_NEGATE_PRODUCT)
+					&& (zs == (xs ^ ys))) ||
+			    ((flags & MADDF_NEGATE_PRODUCT)
+					&& (zs != (xs ^ ys))))
+				/*
+				 * Cases of addition of zeros of equal signs
+				 * or subtraction of zeroes of opposite signs.
+				 * The sign of the resulting zero is in any
+				 * such case determined only by the sign of z.
+				 */
+				return z;
+
+			return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD);
+		}
+		/* x*y is here 0, and z is not 0, so just return z */
 		return z;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
 		SPDNORMY;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
 		SPDNORMX;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
 		/* fall through to real computations */
 	}
@@ -158,108 +161,93 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
 
 	re = xe + ye;
 	rs = xs ^ ys;
-	if (flags & maddf_negate_product)
+	if (flags & MADDF_NEGATE_PRODUCT)
 		rs ^= 1;
 
-	/* shunt to top of word */
-	xm <<= 32 - (SP_FBITS + 1);
-	ym <<= 32 - (SP_FBITS + 1);
-
-	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
-	 */
-	lxm = xm & 0xffff;
-	hxm = xm >> 16;
-	lym = ym & 0xffff;
-	hym = ym >> 16;
-
-	lrm = lxm * lym;	/* 16 * 16 => 32 */
-	hrm = hxm * hym;	/* 16 * 16 => 32 */
+	/* Multiple 24 bit xm and ym to give 48 bit results */
+	rm64 = (uint64_t)xm * ym;
 
-	t = lxm * hym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
+	/* Shunt to top of word */
+	rm64 = rm64 << 16;
 
-	t = hxm * lym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
-
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((int) rm < 0) {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3))) |
-		    ((rm << (SP_FBITS + 1 + 3)) != 0);
+	/* Put explicit bit at bit 62 if necessary */
+	if ((int64_t) rm64 < 0) {
+		rm64 = rm64 >> 1;
 		re++;
-	} else {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (SP_FBITS + 1 + 3 + 1)) != 0);
 	}
-	assert(rm & (SP_HIDDEN_BIT << 3));
 
-	/* And now the addition */
+	assert(rm64 & (1 << 62));
 
-	assert(zm & SP_HIDDEN_BIT);
+	if (zc == IEEE754_CLASS_ZERO) {
+		/*
+		 * Move explicit bit from bit 62 to bit 26 since the
+		 * ieee754sp_format code expects the mantissa to be
+		 * 27 bits wide (24 + 3 rounding bits).
+		 */
+		rm = XSPSRS64(rm64, (62 - 26));
+		return ieee754sp_format(rs, re, rm);
+	}
 
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
+	/* Move explicit bit from bit 23 to bit 62 */
+	zm64 = (uint64_t)zm << (62 - 23);
+	assert(zm64 & (1 << 62));
 
+	/* Make the exponents the same */
 	if (ze > re) {
 		/*
 		 * Have to shift r fraction right to align.
 		 */
 		s = ze - re;
-		rm = XSPSRS(rm, s);
+		rm64 = XSPSRS64(rm64, s);
 		re += s;
 	} else if (re > ze) {
 		/*
 		 * Have to shift z fraction right to align.
 		 */
 		s = re - ze;
-		zm = XSPSRS(zm, s);
+		zm64 = XSPSRS64(zm64, s);
 		ze += s;
 	}
 	assert(ze == re);
 	assert(ze <= SP_EMAX);
 
+	/* Do the addition */
 	if (zs == rs) {
 		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in zm, zs and ze.
+		 * Generate 64 bit result by adding two 63 bit numbers
+		 * leaving result in zm64, zs and ze.
 		 */
-		zm = zm + rm;
-
-		if (zm >> (SP_FBITS + 1 + 3)) { /* carry out */
-			zm = XSPSRS1(zm);
+		zm64 = zm64 + rm64;
+		if ((int64_t)zm64 < 0) {	/* carry out */
+			zm64 = XSPSRS1(zm64);
 			ze++;
 		}
 	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
+		if (zm64 >= rm64) {
+			zm64 = zm64 - rm64;
 		} else {
-			zm = rm - zm;
+			zm64 = rm64 - zm64;
 			zs = rs;
 		}
-		if (zm == 0)
+		if (zm64 == 0)
 			return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD);
 
 		/*
-		 * Normalize in extended single precision
+		 * Put explicit bit at bit 62 if necessary.
 		 */
-		while ((zm >> (SP_MBITS + 3)) == 0) {
-			zm <<= 1;
+		while ((zm64 >> 62) == 0) {
+			zm64 <<= 1;
 			ze--;
 		}
-
 	}
+
+	/*
+	 * Move explicit bit from bit 62 to bit 26 since the
+	 * ieee754sp_format code expects the mantissa to be
+	 * 27 bits wide (24 + 3 rounding bits).
+	 */
+	zm = XSPSRS64(zm64, (62 - 26));
+
 	return ieee754sp_format(zs, ze, zm);
 }
 
@@ -272,5 +260,5 @@ union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
 union ieee754sp ieee754sp_msubf(union ieee754sp z, union ieee754sp x,
 				union ieee754sp y)
 {
-	return _sp_maddf(z, x, y, maddf_negate_product);
+	return _sp_maddf(z, x, y, MADDF_NEGATE_PRODUCT);
 }
diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c
index 3c7c9bf57bf3..6f892c1f3ad7 100644
--- a/arch/mips/ralink/mt7620.c
+++ b/arch/mips/ralink/mt7620.c
@@ -176,7 +176,7 @@ static struct rt2880_pmx_func spi_cs1_grp_mt7628[] = {
 
 static struct rt2880_pmx_func spis_grp_mt7628[] = {
 	FUNC("pwm_uart2", 3, 14, 4),
-	FUNC("util", 2, 14, 4),
+	FUNC("utif", 2, 14, 4),
 	FUNC("gpio", 1, 14, 4),
 	FUNC("spis", 0, 14, 4),
 };
@@ -190,28 +190,28 @@ static struct rt2880_pmx_func gpio_grp_mt7628[] = {
 
 static struct rt2880_pmx_func p4led_kn_grp_mt7628[] = {
 	FUNC("jtag", 3, 30, 1),
-	FUNC("util", 2, 30, 1),
+	FUNC("utif", 2, 30, 1),
 	FUNC("gpio", 1, 30, 1),
 	FUNC("p4led_kn", 0, 30, 1),
 };
 
 static struct rt2880_pmx_func p3led_kn_grp_mt7628[] = {
 	FUNC("jtag", 3, 31, 1),
-	FUNC("util", 2, 31, 1),
+	FUNC("utif", 2, 31, 1),
 	FUNC("gpio", 1, 31, 1),
 	FUNC("p3led_kn", 0, 31, 1),
 };
 
 static struct rt2880_pmx_func p2led_kn_grp_mt7628[] = {
 	FUNC("jtag", 3, 32, 1),
-	FUNC("util", 2, 32, 1),
+	FUNC("utif", 2, 32, 1),
 	FUNC("gpio", 1, 32, 1),
 	FUNC("p2led_kn", 0, 32, 1),
 };
 
 static struct rt2880_pmx_func p1led_kn_grp_mt7628[] = {
 	FUNC("jtag", 3, 33, 1),
-	FUNC("util", 2, 33, 1),
+	FUNC("utif", 2, 33, 1),
 	FUNC("gpio", 1, 33, 1),
 	FUNC("p1led_kn", 0, 33, 1),
 };
@@ -232,28 +232,28 @@ static struct rt2880_pmx_func wled_kn_grp_mt7628[] = {
 
 static struct rt2880_pmx_func p4led_an_grp_mt7628[] = {
 	FUNC("jtag", 3, 39, 1),
-	FUNC("util", 2, 39, 1),
+	FUNC("utif", 2, 39, 1),
 	FUNC("gpio", 1, 39, 1),
 	FUNC("p4led_an", 0, 39, 1),
 };
 
 static struct rt2880_pmx_func p3led_an_grp_mt7628[] = {
 	FUNC("jtag", 3, 40, 1),
-	FUNC("util", 2, 40, 1),
+	FUNC("utif", 2, 40, 1),
 	FUNC("gpio", 1, 40, 1),
 	FUNC("p3led_an", 0, 40, 1),
 };
 
 static struct rt2880_pmx_func p2led_an_grp_mt7628[] = {
 	FUNC("jtag", 3, 41, 1),
-	FUNC("util", 2, 41, 1),
+	FUNC("utif", 2, 41, 1),
 	FUNC("gpio", 1, 41, 1),
 	FUNC("p2led_an", 0, 41, 1),
 };
 
 static struct rt2880_pmx_func p1led_an_grp_mt7628[] = {
 	FUNC("jtag", 3, 42, 1),
-	FUNC("util", 2, 42, 1),
+	FUNC("utif", 2, 42, 1),
 	FUNC("gpio", 1, 42, 1),
 	FUNC("p1led_an", 0, 42, 1),
 };
diff --git a/arch/mips/ralink/rt3883.c b/arch/mips/ralink/rt3883.c
index 9e4631acfcb5..3e68e35daf21 100644
--- a/arch/mips/ralink/rt3883.c
+++ b/arch/mips/ralink/rt3883.c
@@ -145,5 +145,5 @@ void prom_soc_init(struct ralink_soc_info *soc_info)
 
 	rt2880_pinmux_data = rt3883_pinmux_data;
 
-	ralink_soc == RT3883_SOC;
+	ralink_soc = RT3883_SOC;
 }
diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index 518f4f5f1f43..d63d42533133 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -39,7 +39,7 @@
  *  the PDC INTRIGUE calls.  This is done to eliminate bugs introduced
  *  in various PDC revisions.  The code is much more maintainable
  *  and reliable this way vs having to debug on every version of PDC
- *  on every box. 
+ *  on every box.
  */
 
 #include <linux/capability.h>
@@ -195,8 +195,8 @@ static int perf_config(uint32_t *image_ptr);
 static int perf_release(struct inode *inode, struct file *file);
 static int perf_open(struct inode *inode, struct file *file);
 static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos);
-static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, 
-	loff_t *ppos);
+static ssize_t perf_write(struct file *file, const char __user *buf,
+	size_t count, loff_t *ppos);
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 static void perf_start_counters(void);
 static int perf_stop_counters(uint32_t *raddr);
@@ -222,7 +222,7 @@ extern void perf_intrigue_disable_perf_counters (void);
 /*
  * configure:
  *
- * Configure the cpu with a given data image.  First turn off the counters, 
+ * Configure the cpu with a given data image.  First turn off the counters,
  * then download the image, then turn the counters back on.
  */
 static int perf_config(uint32_t *image_ptr)
@@ -234,7 +234,7 @@ static int perf_config(uint32_t *image_ptr)
 	error = perf_stop_counters(raddr);
 	if (error != 0) {
 		printk("perf_config: perf_stop_counters = %ld\n", error);
-		return -EINVAL; 
+		return -EINVAL;
 	}
 
 printk("Preparing to write image\n");
@@ -242,7 +242,7 @@ printk("Preparing to write image\n");
 	error = perf_write_image((uint64_t *)image_ptr);
 	if (error != 0) {
 		printk("perf_config: DOWNLOAD = %ld\n", error);
-		return -EINVAL; 
+		return -EINVAL;
 	}
 
 printk("Preparing to start counters\n");
@@ -254,7 +254,7 @@ printk("Preparing to start counters\n");
 }
 
 /*
- * Open the device and initialize all of its memory.  The device is only 
+ * Open the device and initialize all of its memory.  The device is only
  * opened once, but can be "queried" by multiple processes that know its
  * file descriptor.
  */
@@ -298,8 +298,8 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t
  * called on the processor that the download should happen
  * on.
  */
-static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, 
-	loff_t *ppos)
+static ssize_t perf_write(struct file *file, const char __user *buf,
+	size_t count, loff_t *ppos)
 {
 	int err;
 	size_t image_size;
@@ -307,11 +307,11 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun
 	uint32_t interface_type;
 	uint32_t test;
 
-	if (perf_processor_interface == ONYX_INTF) 
+	if (perf_processor_interface == ONYX_INTF)
 		image_size = PCXU_IMAGE_SIZE;
-	else if (perf_processor_interface == CUDA_INTF) 
+	else if (perf_processor_interface == CUDA_INTF)
 		image_size = PCXW_IMAGE_SIZE;
-	else 
+	else
 		return -EFAULT;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -331,22 +331,22 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun
 
 	/* First check the machine type is correct for
 	   the requested image */
-        if (((perf_processor_interface == CUDA_INTF) &&
-		       (interface_type != CUDA_INTF)) ||
-	    ((perf_processor_interface == ONYX_INTF) &&
-	               (interface_type != ONYX_INTF))) 
+	if (((perf_processor_interface == CUDA_INTF) &&
+			(interface_type != CUDA_INTF)) ||
+		((perf_processor_interface == ONYX_INTF) &&
+			(interface_type != ONYX_INTF)))
 		return -EINVAL;
 
 	/* Next check to make sure the requested image
 	   is valid */
-	if (((interface_type == CUDA_INTF) && 
+	if (((interface_type == CUDA_INTF) &&
 		       (test >= MAX_CUDA_IMAGES)) ||
-	    ((interface_type == ONYX_INTF) && 
-		       (test >= MAX_ONYX_IMAGES))) 
+	    ((interface_type == ONYX_INTF) &&
+		       (test >= MAX_ONYX_IMAGES)))
 		return -EINVAL;
 
 	/* Copy the image into the processor */
-	if (interface_type == CUDA_INTF) 
+	if (interface_type == CUDA_INTF)
 		return perf_config(cuda_images[test]);
 	else
 		return perf_config(onyx_images[test]);
@@ -360,7 +360,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun
 static void perf_patch_images(void)
 {
 #if 0 /* FIXME!! */
-/* 
+/*
  * NOTE:  this routine is VERY specific to the current TLB image.
  * If the image is changed, this routine might also need to be changed.
  */
@@ -368,9 +368,9 @@ static void perf_patch_images(void)
 	extern void $i_dtlb_miss_2_0();
 	extern void PA2_0_iva();
 
-	/* 
+	/*
 	 * We can only use the lower 32-bits, the upper 32-bits should be 0
-	 * anyway given this is in the kernel 
+	 * anyway given this is in the kernel
 	 */
 	uint32_t itlb_addr  = (uint32_t)&($i_itlb_miss_2_0);
 	uint32_t dtlb_addr  = (uint32_t)&($i_dtlb_miss_2_0);
@@ -378,21 +378,21 @@ static void perf_patch_images(void)
 
 	if (perf_processor_interface == ONYX_INTF) {
 		/* clear last 2 bytes */
-		onyx_images[TLBMISS][15] &= 0xffffff00;  
+		onyx_images[TLBMISS][15] &= 0xffffff00;
 		/* set 2 bytes */
 		onyx_images[TLBMISS][15] |= (0x000000ff&((dtlb_addr) >> 24));
 		onyx_images[TLBMISS][16] = (dtlb_addr << 8)&0xffffff00;
 		onyx_images[TLBMISS][17] = itlb_addr;
 
 		/* clear last 2 bytes */
-		onyx_images[TLBHANDMISS][15] &= 0xffffff00;  
+		onyx_images[TLBHANDMISS][15] &= 0xffffff00;
 		/* set 2 bytes */
 		onyx_images[TLBHANDMISS][15] |= (0x000000ff&((dtlb_addr) >> 24));
 		onyx_images[TLBHANDMISS][16] = (dtlb_addr << 8)&0xffffff00;
 		onyx_images[TLBHANDMISS][17] = itlb_addr;
 
 		/* clear last 2 bytes */
-		onyx_images[BIG_CPI][15] &= 0xffffff00;  
+		onyx_images[BIG_CPI][15] &= 0xffffff00;
 		/* set 2 bytes */
 		onyx_images[BIG_CPI][15] |= (0x000000ff&((dtlb_addr) >> 24));
 		onyx_images[BIG_CPI][16] = (dtlb_addr << 8)&0xffffff00;
@@ -405,24 +405,24 @@ static void perf_patch_images(void)
 
 	} else if (perf_processor_interface == CUDA_INTF) {
 		/* Cuda interface */
-		cuda_images[TLBMISS][16] =  
+		cuda_images[TLBMISS][16] =
 			(cuda_images[TLBMISS][16]&0xffff0000) |
 			((dtlb_addr >> 8)&0x0000ffff);
-		cuda_images[TLBMISS][17] = 
+		cuda_images[TLBMISS][17] =
 			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
 		cuda_images[TLBMISS][18] = (itlb_addr << 16)&0xffff0000;
 
-		cuda_images[TLBHANDMISS][16] = 
+		cuda_images[TLBHANDMISS][16] =
 			(cuda_images[TLBHANDMISS][16]&0xffff0000) |
 			((dtlb_addr >> 8)&0x0000ffff);
-		cuda_images[TLBHANDMISS][17] = 
+		cuda_images[TLBHANDMISS][17] =
 			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
 		cuda_images[TLBHANDMISS][18] = (itlb_addr << 16)&0xffff0000;
 
-		cuda_images[BIG_CPI][16] = 
+		cuda_images[BIG_CPI][16] =
 			(cuda_images[BIG_CPI][16]&0xffff0000) |
 			((dtlb_addr >> 8)&0x0000ffff);
-		cuda_images[BIG_CPI][17] = 
+		cuda_images[BIG_CPI][17] =
 			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
 		cuda_images[BIG_CPI][18] = (itlb_addr << 16)&0xffff0000;
 	} else {
@@ -434,7 +434,7 @@ static void perf_patch_images(void)
 
 /*
  * ioctl routine
- * All routines effect the processor that they are executed on.  Thus you 
+ * All routines effect the processor that they are executed on.  Thus you
  * must be running on the processor that you wish to change.
  */
 
@@ -460,7 +460,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			}
 
 			/* copy out the Counters */
-			if (copy_to_user((void __user *)arg, raddr, 
+			if (copy_to_user((void __user *)arg, raddr,
 					sizeof (raddr)) != 0) {
 				error =  -EFAULT;
 				break;
@@ -488,7 +488,7 @@ static const struct file_operations perf_fops = {
 	.open = perf_open,
 	.release = perf_release
 };
-	
+
 static struct miscdevice perf_dev = {
 	MISC_DYNAMIC_MINOR,
 	PA_PERF_DEV,
@@ -596,7 +596,7 @@ static int perf_stop_counters(uint32_t *raddr)
 		/* OR sticky2 (bit 1496) to counter2 bit 32 */
 		tmp64 |= (userbuf[23] >> 8) & 0x0000000080000000;
 		raddr[2] = (uint32_t)tmp64;
-		
+
 		/* Counter3 is bits 1497 to 1528 */
 		tmp64 =  (userbuf[23] >> 7) & 0x00000000ffffffff;
 		/* OR sticky3 (bit 1529) to counter3 bit 32 */
@@ -618,7 +618,7 @@ static int perf_stop_counters(uint32_t *raddr)
 		userbuf[22] = 0;
 		userbuf[23] = 0;
 
-		/* 
+		/*
 		 * Write back the zeroed bytes + the image given
 		 * the read was destructive.
 		 */
@@ -626,13 +626,13 @@ static int perf_stop_counters(uint32_t *raddr)
 	} else {
 
 		/*
-		 * Read RDR-15 which contains the counters and sticky bits 
+		 * Read RDR-15 which contains the counters and sticky bits
 		 */
 		if (!perf_rdr_read_ubuf(15, userbuf)) {
 			return -13;
 		}
 
-		/* 
+		/*
 		 * Clear out the counters
 		 */
 		perf_rdr_clear(15);
@@ -645,7 +645,7 @@ static int perf_stop_counters(uint32_t *raddr)
 		raddr[2] = (uint32_t)((userbuf[1] >> 32) & 0x00000000ffffffffUL);
 		raddr[3] = (uint32_t)(userbuf[1] & 0x00000000ffffffffUL);
 	}
- 
+
 	return 0;
 }
 
@@ -683,7 +683,7 @@ static int perf_rdr_read_ubuf(uint32_t	rdr_num, uint64_t *buffer)
 	i = tentry->num_words;
 	while (i--) {
 		buffer[i] = 0;
-	}	
+	}
 
 	/* Check for bits an even number of 64 */
 	if ((xbits = width & 0x03f) != 0) {
@@ -809,18 +809,22 @@ static int perf_write_image(uint64_t *memaddr)
 	}
 
 	runway = ioremap_nocache(cpu_device->hpa.start, 4096);
+	if (!runway) {
+		pr_err("perf_write_image: ioremap failed!\n");
+		return -ENOMEM;
+	}
 
 	/* Merge intrigue bits into Runway STATUS 0 */
 	tmp64 = __raw_readq(runway + RUNWAY_STATUS) & 0xffecfffffffffffful;
-	__raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), 
+	__raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul),
 		     runway + RUNWAY_STATUS);
-	
+
 	/* Write RUNWAY DEBUG registers */
 	for (i = 0; i < 8; i++) {
 		__raw_writeq(*memaddr++, runway + RUNWAY_DEBUG);
 	}
 
-	return 0; 
+	return 0;
 }
 
 /*
@@ -844,7 +848,7 @@ printk("perf_rdr_write\n");
 			perf_rdr_shift_out_U(rdr_num, buffer[i]);
 		} else {
 			perf_rdr_shift_out_W(rdr_num, buffer[i]);
-		}	
+		}
 	}
 printk("perf_rdr_write done\n");
 }
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 23de307c3052..41e60a9c7db2 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -742,7 +742,7 @@ lws_compare_and_swap_2:
 10:	ldd	0(%r25), %r25
 11:	ldd	0(%r24), %r24
 #else
-	/* Load new value into r22/r23 - high/low */
+	/* Load old value into r22/r23 - high/low */
 10:	ldw	0(%r25), %r22
 11:	ldw	4(%r25), %r23
 	/* Load new value into fr4 for atomic store later */
@@ -834,11 +834,11 @@ cas2_action:
 	copy	%r0, %r28
 #else
 	/* Compare first word */
-19:	ldw,ma	0(%r26), %r29
+19:	ldw	0(%r26), %r29
 	sub,=	%r29, %r22, %r0
 	b,n	cas2_end
 	/* Compare second word */
-20:	ldw,ma	4(%r26), %r29
+20:	ldw	4(%r26), %r29
 	sub,=	%r29, %r23, %r0
 	b,n	cas2_end
 	/* Perform the store */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index b2da7c8baed7..292458b694fb 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -235,6 +235,28 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
 
 #define SWIZ_PTR(p)		((unsigned char __user *)((p) ^ swiz))
 
+#define __get_user_or_set_dar(_regs, _dest, _addr)		\
+	({							\
+		int rc = 0;					\
+		typeof(_addr) __addr = (_addr);			\
+		if (__get_user_inatomic(_dest, __addr)) {	\
+			_regs->dar = (unsigned long)__addr;	\
+			rc = -EFAULT;				\
+		}						\
+		rc;						\
+	})
+
+#define __put_user_or_set_dar(_regs, _src, _addr)		\
+	({							\
+		int rc = 0;					\
+		typeof(_addr) __addr = (_addr);			\
+		if (__put_user_inatomic(_src, __addr)) {	\
+			_regs->dar = (unsigned long)__addr;	\
+			rc = -EFAULT;				\
+		}						\
+		rc;						\
+	})
+
 static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
 			    unsigned int reg, unsigned int nb,
 			    unsigned int flags, unsigned int instr,
@@ -263,9 +285,10 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
 		} else {
 			unsigned long pc = regs->nip ^ (swiz & 4);
 
-			if (__get_user_inatomic(instr,
-						(unsigned int __user *)pc))
+			if (__get_user_or_set_dar(regs, instr,
+						  (unsigned int __user *)pc))
 				return -EFAULT;
+
 			if (swiz == 0 && (flags & SW))
 				instr = cpu_to_le32(instr);
 			nb = (instr >> 11) & 0x1f;
@@ -309,31 +332,31 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
 			       ((nb0 + 3) / 4) * sizeof(unsigned long));
 
 		for (i = 0; i < nb; ++i, ++p)
-			if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
-						SWIZ_PTR(p)))
+			if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz),
+						  SWIZ_PTR(p)))
 				return -EFAULT;
 		if (nb0 > 0) {
 			rptr = &regs->gpr[0];
 			addr += nb;
 			for (i = 0; i < nb0; ++i, ++p)
-				if (__get_user_inatomic(REG_BYTE(rptr,
-								 i ^ bswiz),
-							SWIZ_PTR(p)))
+				if (__get_user_or_set_dar(regs,
+							  REG_BYTE(rptr, i ^ bswiz),
+							  SWIZ_PTR(p)))
 					return -EFAULT;
 		}
 
 	} else {
 		for (i = 0; i < nb; ++i, ++p)
-			if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
-						SWIZ_PTR(p)))
+			if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz),
+						  SWIZ_PTR(p)))
 				return -EFAULT;
 		if (nb0 > 0) {
 			rptr = &regs->gpr[0];
 			addr += nb;
 			for (i = 0; i < nb0; ++i, ++p)
-				if (__put_user_inatomic(REG_BYTE(rptr,
-								 i ^ bswiz),
-							SWIZ_PTR(p)))
+				if (__put_user_or_set_dar(regs,
+							  REG_BYTE(rptr, i ^ bswiz),
+							  SWIZ_PTR(p)))
 					return -EFAULT;
 		}
 	}
@@ -345,29 +368,32 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
  * Only POWER6 has these instructions, and it does true little-endian,
  * so we don't need the address swizzling.
  */
-static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
-			   unsigned int flags)
+static int emulate_fp_pair(struct pt_regs *regs, unsigned char __user *addr,
+			   unsigned int reg, unsigned int flags)
 {
 	char *ptr0 = (char *) &current->thread.TS_FPR(reg);
 	char *ptr1 = (char *) &current->thread.TS_FPR(reg+1);
-	int i, ret, sw = 0;
+	int i, sw = 0;
 
 	if (reg & 1)
 		return 0;	/* invalid form: FRS/FRT must be even */
 	if (flags & SW)
 		sw = 7;
-	ret = 0;
+
 	for (i = 0; i < 8; ++i) {
 		if (!(flags & ST)) {
-			ret |= __get_user(ptr0[i^sw], addr + i);
-			ret |= __get_user(ptr1[i^sw], addr + i + 8);
+			if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i))
+				return -EFAULT;
+			if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8))
+				return -EFAULT;
 		} else {
-			ret |= __put_user(ptr0[i^sw], addr + i);
-			ret |= __put_user(ptr1[i^sw], addr + i + 8);
+			if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i))
+				return -EFAULT;
+			if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8))
+				return -EFAULT;
 		}
 	}
-	if (ret)
-		return -EFAULT;
+
 	return 1;	/* exception handled and fixed up */
 }
 
@@ -377,24 +403,27 @@ static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr,
 {
 	char *ptr0 = (char *)&regs->gpr[reg];
 	char *ptr1 = (char *)&regs->gpr[reg+1];
-	int i, ret, sw = 0;
+	int i, sw = 0;
 
 	if (reg & 1)
 		return 0;	/* invalid form: GPR must be even */
 	if (flags & SW)
 		sw = 7;
-	ret = 0;
+
 	for (i = 0; i < 8; ++i) {
 		if (!(flags & ST)) {
-			ret |= __get_user(ptr0[i^sw], addr + i);
-			ret |= __get_user(ptr1[i^sw], addr + i + 8);
+			if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i))
+				return -EFAULT;
+			if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8))
+				return -EFAULT;
 		} else {
-			ret |= __put_user(ptr0[i^sw], addr + i);
-			ret |= __put_user(ptr1[i^sw], addr + i + 8);
+			if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i))
+				return -EFAULT;
+			if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8))
+				return -EFAULT;
 		}
 	}
-	if (ret)
-		return -EFAULT;
+
 	return 1;	/* exception handled and fixed up */
 }
 #endif /* CONFIG_PPC64 */
@@ -687,9 +716,14 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
 	for (j = 0; j < length; j += elsize) {
 		for (i = 0; i < elsize; ++i) {
 			if (flags & ST)
-				ret |= __put_user(ptr[i^sw], addr + i);
+				ret = __put_user_or_set_dar(regs, ptr[i^sw],
+							    addr + i);
 			else
-				ret |= __get_user(ptr[i^sw], addr + i);
+				ret = __get_user_or_set_dar(regs, ptr[i^sw],
+							    addr + i);
+
+			if (ret)
+				return ret;
 		}
 		ptr  += elsize;
 #ifdef __LITTLE_ENDIAN__
@@ -739,7 +773,7 @@ int fix_alignment(struct pt_regs *regs)
 	unsigned int dsisr;
 	unsigned char __user *addr;
 	unsigned long p, swiz;
-	int ret, i;
+	int i;
 	union data {
 		u64 ll;
 		double dd;
@@ -936,7 +970,7 @@ int fix_alignment(struct pt_regs *regs)
 		if (flags & F) {
 			/* Special case for 16-byte FP loads and stores */
 			PPC_WARN_ALIGNMENT(fp_pair, regs);
-			return emulate_fp_pair(addr, reg, flags);
+			return emulate_fp_pair(regs, addr, reg, flags);
 		} else {
 #ifdef CONFIG_PPC64
 			/* Special case for 16-byte loads and stores */
@@ -966,15 +1000,12 @@ int fix_alignment(struct pt_regs *regs)
 		}
 
 		data.ll = 0;
-		ret = 0;
 		p = (unsigned long)addr;
 
 		for (i = 0; i < nb; i++)
-			ret |= __get_user_inatomic(data.v[start + i],
-						   SWIZ_PTR(p++));
-
-		if (unlikely(ret))
-			return -EFAULT;
+			if (__get_user_or_set_dar(regs, data.v[start + i],
+						  SWIZ_PTR(p++)))
+				return -EFAULT;
 
 	} else if (flags & F) {
 		data.ll = current->thread.TS_FPR(reg);
@@ -1046,15 +1077,13 @@ int fix_alignment(struct pt_regs *regs)
 			break;
 		}
 
-		ret = 0;
 		p = (unsigned long)addr;
 
 		for (i = 0; i < nb; i++)
-			ret |= __put_user_inatomic(data.v[start + i],
-						   SWIZ_PTR(p++));
+			if (__put_user_or_set_dar(regs, data.v[start + i],
+						  SWIZ_PTR(p++)))
+				return -EFAULT;
 
-		if (unlikely(ret))
-			return -EFAULT;
 	} else if (flags & F)
 		current->thread.TS_FPR(reg) = data.ll;
 	else
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 2cb4d5552319..891080c4a41e 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1243,10 +1243,14 @@ _GLOBAL(ftrace_caller)
 	stdu	r1,-SWITCH_FRAME_SIZE(r1)
 
 	/* Save all gprs to pt_regs */
-	SAVE_8GPRS(0,r1)
-	SAVE_8GPRS(8,r1)
-	SAVE_8GPRS(16,r1)
-	SAVE_8GPRS(24,r1)
+	SAVE_GPR(0, r1)
+	SAVE_10GPRS(2, r1)
+	SAVE_10GPRS(12, r1)
+	SAVE_10GPRS(22, r1)
+
+	/* Save previous stack pointer (r1) */
+	addi	r8, r1, SWITCH_FRAME_SIZE
+	std	r8, GPR1(r1)
 
 	/* Load special regs for save below */
 	mfmsr   r8
@@ -1300,10 +1304,10 @@ ftrace_call:
 #endif
 
 	/* Restore gprs */
-	REST_8GPRS(0,r1)
-	REST_8GPRS(8,r1)
-	REST_8GPRS(16,r1)
-	REST_8GPRS(24,r1)
+	REST_GPR(0,r1)
+	REST_10GPRS(2,r1)
+	REST_10GPRS(12,r1)
+	REST_10GPRS(22,r1)
 
 	/* Restore callee's TOC */
 	ld	r2, 24(r1)
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 2e2fc1e37715..fd68e19b9ef7 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -764,7 +764,29 @@ EXC_REAL(program_check, 0x700, 0x800)
 EXC_VIRT(program_check, 0x4700, 0x4800, 0x700)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
-	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+	/*
+	 * It's possible to receive a TM Bad Thing type program check with
+	 * userspace register values (in particular r1), but with SRR1 reporting
+	 * that we came from the kernel. Normally that would confuse the bad
+	 * stack logic, and we would report a bad kernel stack pointer. Instead
+	 * we switch to the emergency stack if we're taking a TM Bad Thing from
+	 * the kernel.
+	 */
+	li	r10,MSR_PR		/* Build a mask of MSR_PR ..	*/
+	oris	r10,r10,0x200000@h	/* .. and SRR1_PROGTM		*/
+	and	r10,r10,r12		/* Mask SRR1 with that.		*/
+	srdi	r10,r10,8		/* Shift it so we can compare	*/
+	cmpldi	r10,(0x200000 >> 8)	/* .. with an immediate.	*/
+	bne 1f				/* If != go to normal path.	*/
+
+	/* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack	*/
+	andi.	r10,r12,MSR_PR;		/* Set CR0 correctly for label	*/
+					/* 3 in EXCEPTION_PROLOG_COMMON	*/
+	mr	r10,r1			/* Save r1			*/
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	b 3f				/* Jump into the macro !!	*/
+1:	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index dcbb9144c16d..d97370866a5f 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -131,7 +131,7 @@ static void flush_tmregs_to_thread(struct task_struct *tsk)
 	 * in the appropriate thread structures from live.
 	 */
 
-	if (tsk != current)
+	if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current))
 		return;
 
 	if (MSR_TM_SUSPENDED(mfmsr())) {
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 96698fdf93b4..04e92257fd69 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 	if (MSR_TM_RESV(msr))
 		return -EINVAL;
 
-	/* pull in MSR TM from user context */
+	/* pull in MSR TS bits from user context */
 	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
 
+	/*
+	 * Ensure that TM is enabled in regs->msr before we leave the signal
+	 * handler. It could be the case that (a) user disabled the TM bit
+	 * through the manipulation of the MSR bits in uc_mcontext or (b) the
+	 * TM bit was disabled because a sufficient number of context switches
+	 * happened whilst in the signal handler and load_tm overflowed,
+	 * disabling the TM bit. In either case we can end up with an illegal
+	 * TM state leading to a TM Bad Thing when we return to userspace.
+	 */
+	regs->msr |= MSR_TM;
+
 	/* pull in MSR LE from user context */
 	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c379ff5a4438..da2a7eccb10a 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -129,8 +129,11 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
 static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+	struct kvm *kvm = stt->kvm;
 
+	mutex_lock(&kvm->lock);
 	list_del_rcu(&stt->list);
+	mutex_unlock(&kvm->lock);
 
 	kvm_put_kvm(stt->kvm);
 
@@ -150,6 +153,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
+	struct kvmppc_spapr_tce_table *siter;
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
@@ -157,24 +161,16 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	if (!args->size)
 		return -EINVAL;
 
-	/* Check this LIOBN hasn't been previously allocated */
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == args->liobn)
-			return -EBUSY;
-	}
-
 	size = args->size;
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
-	if (ret) {
-		stt = NULL;
-		goto fail;
-	}
+	if (ret)
+		return ret;
 
 	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
 		      GFP_KERNEL);
 	if (!stt)
-		goto fail;
+		goto fail_acct;
 
 	stt->liobn = args->liobn;
 	stt->page_shift = args->page_shift;
@@ -188,24 +184,39 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			goto fail;
 	}
 
-	kvm_get_kvm(kvm);
-
 	mutex_lock(&kvm->lock);
-	list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR | O_CLOEXEC);
+
+	if (ret >= 0) {
+		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+		kvm_get_kvm(kvm);
+	}
 
 	mutex_unlock(&kvm->lock);
 
-	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				stt, O_RDWR | O_CLOEXEC);
+	if (ret >= 0)
+		return ret;
 
-fail:
-	if (stt) {
-		for (i = 0; i < npages; i++)
-			if (stt->pages[i])
-				__free_page(stt->pages[i]);
+ fail:
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
 
-		kfree(stt);
-	}
+	kfree(stt);
+ fail_acct:
+	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70963c845e96..fc0df0f6fe88 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -601,8 +601,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 #endif
 	case KVM_CAP_PPC_HTM:
-		r = cpu_has_feature(CPU_FTR_TM_COMP) &&
-		    is_kvmppc_hv_enabled(kvm);
+		r = cpu_has_feature(CPU_FTR_TM_COMP) && hv_enabled;
 		break;
 	default:
 		r = 0;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index d5ce34dcf4d9..1e28747d677f 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -42,6 +42,8 @@
 #include <linux/memblock.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -421,6 +423,28 @@ static int __init parse_disable_radix(char *p)
 }
 early_param("disable_radix", parse_disable_radix);
 
+/*
+ * If we're running under a hypervisor, we currently can't do radix
+ * since we don't have the code to do the H_REGISTER_PROC_TBL hcall.
+ * We tell that we're running under a hypervisor by looking for the
+ * /chosen/ibm,architecture-vec-5 property.
+ */
+static void early_check_vec5(void)
+{
+	unsigned long root, chosen;
+	int size;
+	const u8 *vec5;
+
+	root = of_get_flat_dt_root();
+	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+	if (chosen == -FDT_ERR_NOTFOUND)
+		return;
+	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+	if (!vec5)
+		return;
+	cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
+
 void __init mmu_early_init_devtree(void)
 {
 	/* Disable radix mode based on kernel command line. */
@@ -428,6 +452,15 @@ void __init mmu_early_init_devtree(void)
 	if (disable_radix || !(mfmsr() & MSR_HV))
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 
+	/*
+	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+	 * When running bare-metal, we can use radix if we like
+	 * even though the ibm,architecture-vec-5 property created by
+	 * skiboot doesn't have the necessary bits set.
+	 */
+	if (early_radix_enabled() && !(mfmsr() & MSR_HV))
+		early_check_vec5();
+
 	if (early_radix_enabled())
 		radix__early_init_devtree();
 	else
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 4d0a4e5017c2..8e6dd17fe603 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -201,6 +201,10 @@
 	CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
 	CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
 
+/*
+ * Lets restrict use of PMC5 for instruction counting.
+ */
+#define P9_DD1_TEST_ADDER	(ISA207_TEST_ADDER | CNST_PMC_VAL(5))
 
 /* Bits in MMCR1 for PowerISA v2.07 */
 #define MMCR1_UNIT_SHIFT(pmc)		(60 - (4 * ((pmc) - 1)))
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 8e9a81967ff8..9abcd8f65504 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -295,7 +295,7 @@ static struct power_pmu power9_pmu = {
 	.name			= "POWER9",
 	.n_counter		= MAX_PMU_COUNTERS,
 	.add_fields		= ISA207_ADD_FIELDS,
-	.test_adder		= ISA207_TEST_ADDER,
+	.test_adder		= P9_DD1_TEST_ADDER,
 	.compute_mmcr		= isa207_compute_mmcr,
 	.config_bhrb		= power9_config_bhrb,
 	.bhrb_filter_map	= power9_bhrb_filter_map,
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index a560a98bcf3b..6a5e7467445c 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -225,8 +225,10 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index)
 		return -ENOENT;
 
 	dn = dlpar_configure_connector(drc_index, parent_dn);
-	if (!dn)
+	if (!dn) {
+		of_node_put(parent_dn);
 		return -ENOENT;
+	}
 
 	rc = dlpar_attach_node(dn);
 	if (rc)
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 303d28eb03a2..591cbdf615af 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -28,6 +28,7 @@
 #include <linux/cpufeature.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
+#include <linux/fips.h>
 #include <crypto/xts.h>
 #include <asm/cpacf.h>
 
@@ -501,6 +502,12 @@ static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
 	if (err)
 		return err;
 
+	/* In fips mode only 128 bit or 256 bit keys are valid */
+	if (fips_enabled && key_len != 32 && key_len != 64) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
 	/* Pick the correct function code based on the key length */
 	fc = (key_len == 32) ? CPACF_KM_XTS_128 :
 	     (key_len == 64) ? CPACF_KM_XTS_256 : 0;
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index 1113389d0a39..fe7368a41aa8 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -110,22 +110,30 @@ static const u8 initial_parm_block[32] __initconst = {
 
 /*** helper functions ***/
 
+/*
+ * generate_entropy:
+ * This algorithm produces 64 bytes of entropy data based on 1024
+ * individual stckf() invocations assuming that each stckf() value
+ * contributes 0.25 bits of entropy. So the caller gets 256 bit
+ * entropy per 64 byte or 4 bits entropy per byte.
+ */
 static int generate_entropy(u8 *ebuf, size_t nbytes)
 {
 	int n, ret = 0;
-	u8 *pg, *h, hash[32];
+	u8 *pg, *h, hash[64];
 
-	pg = (u8 *) __get_free_page(GFP_KERNEL);
+	/* allocate 2 pages */
+	pg = (u8 *) __get_free_pages(GFP_KERNEL, 1);
 	if (!pg) {
 		prng_errorflag = PRNG_GEN_ENTROPY_FAILED;
 		return -ENOMEM;
 	}
 
 	while (nbytes) {
-		/* fill page with urandom bytes */
-		get_random_bytes(pg, PAGE_SIZE);
-		/* exor page with stckf values */
-		for (n = 0; n < PAGE_SIZE / sizeof(u64); n++) {
+		/* fill pages with urandom bytes */
+		get_random_bytes(pg, 2*PAGE_SIZE);
+		/* exor pages with 1024 stckf values */
+		for (n = 0; n < 2 * PAGE_SIZE / sizeof(u64); n++) {
 			u64 *p = ((u64 *)pg) + n;
 			*p ^= get_tod_clock_fast();
 		}
@@ -134,8 +142,8 @@ static int generate_entropy(u8 *ebuf, size_t nbytes)
 			h = hash;
 		else
 			h = ebuf;
-		/* generate sha256 from this page */
-		cpacf_kimd(CPACF_KIMD_SHA_256, h, pg, PAGE_SIZE);
+		/* hash over the filled pages */
+		cpacf_kimd(CPACF_KIMD_SHA_512, h, pg, 2*PAGE_SIZE);
 		if (n < sizeof(hash))
 			memcpy(ebuf, hash, n);
 		ret += n;
@@ -143,7 +151,7 @@ static int generate_entropy(u8 *ebuf, size_t nbytes)
 		nbytes -= n;
 	}
 
-	free_page((unsigned long)pg);
+	free_pages((unsigned long)pg, 1);
 	return ret;
 }
 
@@ -334,7 +342,7 @@ static int __init prng_sha512_selftest(void)
 static int __init prng_sha512_instantiate(void)
 {
 	int ret, datalen;
-	u8 seed[64];
+	u8 seed[64 + 32 + 16];
 
 	pr_debug("prng runs in SHA-512 mode "
 		 "with chunksize=%d and reseed_limit=%u\n",
@@ -357,12 +365,12 @@ static int __init prng_sha512_instantiate(void)
 	if (ret)
 		goto outfree;
 
-	/* generate initial seed bytestring, first 48 bytes of entropy */
-	ret = generate_entropy(seed, 48);
-	if (ret != 48)
+	/* generate initial seed bytestring, with 256 + 128 bits entropy */
+	ret = generate_entropy(seed, 64 + 32);
+	if (ret != 64 + 32)
 		goto outfree;
 	/* followed by 16 bytes of unique nonce */
-	get_tod_clock_ext(seed + 48);
+	get_tod_clock_ext(seed + 64 + 32);
 
 	/* initial seed of the ppno drng */
 	cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED,
@@ -395,9 +403,9 @@ static void prng_sha512_deinstantiate(void)
 static int prng_sha512_reseed(void)
 {
 	int ret;
-	u8 seed[32];
+	u8 seed[64];
 
-	/* generate 32 bytes of fresh entropy */
+	/* fetch 256 bits of fresh entropy */
 	ret = generate_entropy(seed, sizeof(seed));
 	if (ret != sizeof(seed))
 		return ret;
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bea785d7f853..af85d6b12028 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 
 typedef struct {
+	spinlock_t lock;
 	cpumask_t cpu_attach_mask;
 	atomic_t flush_count;
 	unsigned int flush_mm;
@@ -25,6 +26,7 @@ typedef struct {
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)						   \
+	.context.lock =	__SPIN_LOCK_UNLOCKED(name.context.lock),	   \
 	.context.pgtable_lock =						   \
 			__SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
 	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 515fea5a3fc4..f65a708ac395 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,6 +15,7 @@
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+	spin_lock_init(&mm->context.lock);
 	spin_lock_init(&mm->context.pgtable_lock);
 	INIT_LIST_HEAD(&mm->context.pgtable_list);
 	spin_lock_init(&mm->context.gmap_lock);
@@ -93,7 +94,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	if (prev == next)
 		return;
 	cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
-	cpumask_set_cpu(cpu, mm_cpumask(next));
 	/* Clear old ASCE by loading the kernel ASCE. */
 	__ctl_load(S390_lowcore.kernel_asce, 1, 1);
 	__ctl_load(S390_lowcore.kernel_asce, 7, 7);
@@ -111,9 +111,8 @@ static inline void finish_arch_post_lock_switch(void)
 		preempt_disable();
 		while (atomic_read(&mm->context.flush_count))
 			cpu_relax();
-
-		if (mm->context.flush_mm)
-			__tlb_flush_mm(mm);
+		cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
+		__tlb_flush_mm_lazy(mm);
 		preempt_enable();
 	}
 	set_fs(current->thread.mm_segment);
@@ -126,6 +125,7 @@ static inline void activate_mm(struct mm_struct *prev,
                                struct mm_struct *next)
 {
 	switch_mm(prev, next, current);
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
 	set_user_asce(next);
 }
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0cea7026e4ff..db74d398a443 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -480,7 +480,7 @@ static inline int mm_alloc_pgste(struct mm_struct *mm)
  * In the case that a guest uses storage keys
  * faults should no longer be backed by zero pages
  */
-#define mm_forbids_zeropage mm_use_skey
+#define mm_forbids_zeropage mm_has_pgste
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
@@ -1359,7 +1359,9 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 static inline void pmdp_invalidate(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmdp)
 {
-	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+	pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+
+	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 39846100682a..eed927aeb08f 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -43,23 +43,6 @@ static inline void __tlb_flush_global(void)
  * Flush TLB entries for a specific mm on all CPUs (in case gmap is used
  * this implicates multiple ASCEs!).
  */
-static inline void __tlb_flush_full(struct mm_struct *mm)
-{
-	preempt_disable();
-	atomic_inc(&mm->context.flush_count);
-	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
-		/* Local TLB flush */
-		__tlb_flush_local();
-	} else {
-		/* Global TLB flush */
-		__tlb_flush_global();
-		/* Reset TLB flush mask */
-		cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
-	}
-	atomic_dec(&mm->context.flush_count);
-	preempt_enable();
-}
-
 static inline void __tlb_flush_mm(struct mm_struct *mm)
 {
 	unsigned long gmap_asce;
@@ -71,16 +54,18 @@ static inline void __tlb_flush_mm(struct mm_struct *mm)
 	 */
 	preempt_disable();
 	atomic_inc(&mm->context.flush_count);
+	/* Reset TLB flush mask */
+	cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
+	barrier();
 	gmap_asce = READ_ONCE(mm->context.gmap_asce);
 	if (MACHINE_HAS_IDTE && gmap_asce != -1UL) {
 		if (gmap_asce)
 			__tlb_flush_idte(gmap_asce);
 		__tlb_flush_idte(mm->context.asce);
 	} else {
-		__tlb_flush_full(mm);
+		/* Global TLB flush */
+		__tlb_flush_global();
 	}
-	/* Reset TLB flush mask */
-	cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
 	atomic_dec(&mm->context.flush_count);
 	preempt_enable();
 }
@@ -94,7 +79,6 @@ static inline void __tlb_flush_kernel(void)
 }
 #else
 #define __tlb_flush_global()	__tlb_flush_local()
-#define __tlb_flush_full(mm)	__tlb_flush_local()
 
 /*
  * Flush TLB entries for a specific ASCE on all CPUs.
@@ -112,10 +96,12 @@ static inline void __tlb_flush_kernel(void)
 
 static inline void __tlb_flush_mm_lazy(struct mm_struct * mm)
 {
+	spin_lock(&mm->context.lock);
 	if (mm->context.flush_mm) {
-		__tlb_flush_mm(mm);
 		mm->context.flush_mm = 0;
+		__tlb_flush_mm(mm);
 	}
+	spin_unlock(&mm->context.lock);
 }
 
 /*
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 3ba622702ce4..cb2cd04e6698 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2125,6 +2125,37 @@ static inline void thp_split_mm(struct mm_struct *mm)
 }
 
 /*
+ * Remove all empty zero pages from the mapping for lazy refaulting
+ * - This must be called after mm->context.has_pgste is set, to avoid
+ *   future creation of zero pages
+ * - This must be called after THP was enabled
+ */
+static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
+			   unsigned long end, struct mm_walk *walk)
+{
+	unsigned long addr;
+
+	for (addr = start; addr != end; addr += PAGE_SIZE) {
+		pte_t *ptep;
+		spinlock_t *ptl;
+
+		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+		if (is_zero_pfn(pte_pfn(*ptep)))
+			ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
+		pte_unmap_unlock(ptep, ptl);
+	}
+	return 0;
+}
+
+static inline void zap_zero_pages(struct mm_struct *mm)
+{
+	struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
+
+	walk.mm = mm;
+	walk_page_range(0, TASK_SIZE, &walk);
+}
+
+/*
  * switch on pgstes for its userspace process (for kvm)
  */
 int s390_enable_sie(void)
@@ -2141,6 +2172,7 @@ int s390_enable_sie(void)
 	mm->context.has_pgste = 1;
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
+	zap_zero_pages(mm);
 	up_write(&mm->mmap_sem);
 	return 0;
 }
@@ -2153,13 +2185,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
 static int __s390_enable_skey(pte_t *pte, unsigned long addr,
 			      unsigned long next, struct mm_walk *walk)
 {
-	/*
-	 * Remove all zero page mappings,
-	 * after establishing a policy to forbid zero page mappings
-	 * following faults for that page will get fresh anonymous pages
-	 */
-	if (is_zero_pfn(pte_pfn(*pte)))
-		ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
 	/* Clear storage key */
 	ptep_zap_key(walk->mm, addr, pte);
 	return 0;
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 18d4107e10ee..97fc449a7470 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -56,13 +56,12 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
-	unsigned long mask, result;
 	struct page *head, *page;
+	unsigned long mask;
 	int refs;
 
-	result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
-	mask = result | _SEGMENT_ENTRY_INVALID;
-	if ((pmd_val(pmd) & mask) != result)
+	mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID;
+	if ((pmd_val(pmd) & mask) != 0)
 		return 0;
 	VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
 
diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h
index be0cc1beed41..3fae200dd251 100644
--- a/arch/sparc/include/asm/setup.h
+++ b/arch/sparc/include/asm/setup.h
@@ -59,8 +59,11 @@ extern atomic_t dcpage_flushes;
 extern atomic_t dcpage_flushes_xcall;
 
 extern int sysctl_tsb_ratio;
-#endif
 
+#ifdef CONFIG_SERIAL_SUNHV
+void sunhv_migrate_hvcons_irq(int cpu);
+#endif
+#endif
 void sun_do_break(void);
 extern int stop_a_enabled;
 extern int scons_pwroff;
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 2deb89ef1d5f..ca7cb8e57ab0 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1465,8 +1465,12 @@ void smp_send_stop(void)
 	int cpu;
 
 	if (tlb_type == hypervisor) {
+		int this_cpu = smp_processor_id();
+#ifdef CONFIG_SERIAL_SUNHV
+		sunhv_migrate_hvcons_irq(this_cpu);
+#endif
 		for_each_online_cpu(cpu) {
-			if (cpu == smp_processor_id())
+			if (cpu == this_cpu)
 				continue;
 #ifdef CONFIG_SUN_LDOMS
 			if (ldom_domaining_enabled) {
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
 #define new_len2		145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..d4aea31eec03 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
  *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
-#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
  * Pad the second replacement alternative with additional NOPs if it is
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b31761ecce63..7bcd138c3aa9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)			\
 do {								\
+	unsigned long base;					\
 	unsigned v;						\
 	(pr_reg)[0] = (regs)->r15;				\
 	(pr_reg)[1] = (regs)->r14;				\
@@ -226,8 +227,8 @@ do {								\
 	(pr_reg)[18] = (regs)->flags;				\
 	(pr_reg)[19] = (regs)->sp;				\
 	(pr_reg)[20] = (regs)->ss;				\
-	(pr_reg)[21] = current->thread.fsbase;			\
-	(pr_reg)[22] = current->thread.gsbase;			\
+	rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base;		\
+	rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base;	\
 	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
 	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
 	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 19d14ac23ef9..fc3c7e49c8e4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
 
 	bool perm_ok; /* do not check permissions if true */
 	bool ud;	/* inject an #UD if host doesn't support insn */
+	bool tf;	/* TF value before instruction (after for syscall/sysret) */
 
 	bool have_exception;
 	struct x86_exception exception;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 11e572561960..c3c1ad2fce5c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -178,10 +178,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
 		return -EINVAL;
 	}
 
+	if (!enabled) {
+		++disabled_cpus;
+		return -EINVAL;
+	}
+
 	if (boot_cpu_physical_apicid != -1U)
 		ver = boot_cpu_apic_version;
 
-	cpu = __generic_processor_info(id, ver, enabled);
+	cpu = generic_processor_info(id, ver);
 	if (cpu >= 0)
 		early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2234918e494..e2ead34da465 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2070,7 +2070,7 @@ static int allocate_logical_cpuid(int apicid)
 	return nr_logical_cpuids++;
 }
 
-int __generic_processor_info(int apicid, int version, bool enabled)
+int generic_processor_info(int apicid, int version)
 {
 	int cpu, max = nr_cpu_ids;
 	bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2128,11 +2128,9 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 	if (num_processors >= nr_cpu_ids) {
 		int thiscpu = max + disabled_cpus;
 
-		if (enabled) {
-			pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
-				   "reached. Processor %d/0x%x ignored.\n",
-				   max, thiscpu, apicid);
-		}
+		pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+			   "reached. Processor %d/0x%x ignored.\n",
+			   max, thiscpu, apicid);
 
 		disabled_cpus++;
 		return -EINVAL;
@@ -2184,23 +2182,13 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 		apic->x86_32_early_logical_apicid(cpu);
 #endif
 	set_cpu_possible(cpu, true);
-
-	if (enabled) {
-		num_processors++;
-		physid_set(apicid, phys_cpu_present_map);
-		set_cpu_present(cpu, true);
-	} else {
-		disabled_cpus++;
-	}
+	physid_set(apicid, phys_cpu_present_map);
+	set_cpu_present(cpu, true);
+	num_processors++;
 
 	return cpu;
 }
 
-int generic_processor_info(int apicid, int version)
-{
-	return __generic_processor_info(apicid, version, true);
-}
-
 int hard_smp_processor_id(void)
 {
 	return read_apic_id();
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..13dbcc0f9d03 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -34,6 +34,7 @@
 #include <linux/mm.h>
 
 #include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/setup.h>
@@ -1046,6 +1047,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
 	return 0;
 }
 
+static bool is_blacklisted(unsigned int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
+		pr_err_once("late loading on model 79 is disabled.\n");
+		return true;
+	}
+
+	return false;
+}
+
 static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 					     bool refresh_fw)
 {
@@ -1054,6 +1067,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	const struct firmware *firmware;
 	enum ucode_state ret;
 
+	if (is_blacklisted(cpu))
+		return UCODE_NFOUND;
+
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
 
@@ -1078,6 +1094,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
 static enum ucode_state
 request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
+	if (is_blacklisted(cpu))
+		return UCODE_NFOUND;
+
 	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c114b132d121..7052d9a65fe9 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -130,11 +130,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	fpu__activate_fpstate_write(fpu);
 
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
+	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
 		ret = copyin_to_xsaves(kbuf, ubuf, xsave);
-	else
+	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 
+		/* xcomp_bv must be 0 when using uncompacted format */
+		if (!ret && xsave->header.xcomp_bv)
+			ret = -EINVAL;
+	}
+
 	/*
 	 * In case of failure, mark all states as init:
 	 */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a184c210efba..3ec0d2d64601 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -329,6 +329,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		} else {
 			err = __copy_from_user(&fpu->state.xsave,
 					       buf_fx, state_size);
+
+			/* xcomp_bv must be 0 when using uncompacted format */
+			if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv)
+				err = -EINVAL;
 		}
 
 		if (err || __copy_from_user(&env, buf, sizeof(env))) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 55ffd9dc2258..77f17cbfe271 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -141,7 +141,8 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1 ||
+		   rcu_preempt_depth();
 	init_swait_queue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	raw_spin_unlock(&b->lock);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..0887d2ae3797 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task)
 	}
 }
 
+enum which_selector {
+	FS,
+	GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+					     unsigned short selector,
+					     enum which_selector which)
+{
+	if (likely(selector == 0)) {
+		/*
+		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
+		 * be the pre-existing saved base or it could be zero.  On AMD
+		 * (with X86_BUG_NULL_SEG), the segment base could be almost
+		 * anything.
+		 *
+		 * This branch is very hot (it's hit twice on almost every
+		 * context switch between 64-bit programs), and avoiding
+		 * the RDMSR helps a lot, so we just assume that whatever
+		 * value is already saved is correct.  This matches historical
+		 * Linux behavior, so it won't break existing applications.
+		 *
+		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+		 * report that the base is zero, it needs to actually be zero:
+		 * see the corresponding logic in load_seg_legacy.
+		 */
+	} else {
+		/*
+		 * If the selector is 1, 2, or 3, then the base is zero on
+		 * !X86_BUG_NULL_SEG CPUs and could be anything on
+		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
+		 * has never attempted to preserve the base across context
+		 * switches.
+		 *
+		 * If selector > 3, then it refers to a real segment, and
+		 * saving the base isn't necessary.
+		 */
+		if (which == FS)
+			prev_p->thread.fsbase = 0;
+		else
+			prev_p->thread.gsbase = 0;
+	}
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+	savesegment(fs, task->thread.fsindex);
+	savesegment(gs, task->thread.gsindex);
+	save_base_legacy(task, task->thread.fsindex, FS);
+	save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+				    unsigned short sel)
+{
+	if (which == FS)
+		loadsegment(fs, sel);
+	else
+		load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+					    unsigned long prev_base,
+					    unsigned short next_index,
+					    unsigned long next_base,
+					    enum which_selector which)
+{
+	if (likely(next_index <= 3)) {
+		/*
+		 * The next task is using 64-bit TLS, is not using this
+		 * segment at all, or is having fun with arcane CPU features.
+		 */
+		if (next_base == 0) {
+			/*
+			 * Nasty case: on AMD CPUs, we need to forcibly zero
+			 * the base.
+			 */
+			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+				loadseg(which, __USER_DS);
+				loadseg(which, next_index);
+			} else {
+				/*
+				 * We could try to exhaustively detect cases
+				 * under which we can skip the segment load,
+				 * but there's really only one case that matters
+				 * for performance: if both the previous and
+				 * next states are fully zeroed, we can skip
+				 * the load.
+				 *
+				 * (This assumes that prev_base == 0 has no
+				 * false positives.  This is the case on
+				 * Intel-style CPUs.)
+				 */
+				if (likely(prev_index | next_index | prev_base))
+					loadseg(which, next_index);
+			}
+		} else {
+			if (prev_index != next_index)
+				loadseg(which, next_index);
+			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+			       next_base);
+		}
+	} else {
+		/*
+		 * The next task is using a real segment.  Loading the selector
+		 * is sufficient.
+		 */
+		loadseg(which, next_index);
+	}
+}
+
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 		unsigned long arg, struct task_struct *p, unsigned long tls)
 {
@@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 {
+	WARN_ON_ONCE(regs != current_pt_regs());
+
+	if (static_cpu_has(X86_BUG_NULL_SEG)) {
+		/* Loading zero below won't clear the base. */
+		loadsegment(fs, __USER_DS);
+		load_gs_index(__USER_DS);
+	}
+
 	loadsegment(fs, 0);
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
+
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->cs		= _cs;
@@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	unsigned prev_fsindex, prev_gsindex;
 	fpu_switch_t fpu_switch;
 
 	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
@@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 *
 	 * (e.g. xen_load_tls())
 	 */
-	savesegment(fs, prev_fsindex);
-	savesegment(gs, prev_gsindex);
+	save_fsgs(prev_p);
 
 	/*
 	 * Load TLS before restoring any segments so that segment loads
@@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
-	/*
-	 * Switch FS and GS.
-	 *
-	 * These are even more complicated than DS and ES: they have
-	 * 64-bit bases are that controlled by arch_prctl.  The bases
-	 * don't necessarily match the selectors, as user code can do
-	 * any number of things to cause them to be inconsistent.
-	 *
-	 * We don't promise to preserve the bases if the selectors are
-	 * nonzero.  We also don't promise to preserve the base if the
-	 * selector is zero and the base doesn't match whatever was
-	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
-	 * FSGSBASE instructions are enabled, we'll need to offer
-	 * stronger guarantees.)
-	 *
-	 * As an invariant,
-	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
-	 * impossible.
-	 */
-	if (next->fsindex) {
-		/* Loading a nonzero value into FS sets the index and base. */
-		loadsegment(fs, next->fsindex);
-	} else {
-		if (next->fsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_fsindex)
-				loadsegment(fs, 0);
-			wrmsrl(MSR_FS_BASE, next->fsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 */
-				loadsegment(fs, __USER_DS);
-				loadsegment(fs, 0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_FS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->fsbase || prev_fsindex)
-					loadsegment(fs, 0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_fsindex)
-		prev->fsbase = 0;
-	prev->fsindex = prev_fsindex;
-
-	if (next->gsindex) {
-		/* Loading a nonzero value into GS sets the index and base. */
-		load_gs_index(next->gsindex);
-	} else {
-		if (next->gsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_gsindex)
-				load_gs_index(0);
-			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 *
-				 * This contains a pointless SWAPGS pair.
-				 * Fixing it would involve an explicit check
-				 * for Xen or a new pvop.
-				 */
-				load_gs_index(__USER_DS);
-				load_gs_index(0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_GS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->gsbase || prev_gsindex)
-					load_gs_index(0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_gsindex)
-		prev->gsbase = 0;
-	prev->gsindex = prev_gsindex;
+	load_seg_legacy(prev->fsindex, prev->fsbase,
+			next->fsindex, next->fsbase, FS);
+	load_seg_legacy(prev->gsindex, prev->gsbase,
+			next->gsindex, next->gsbase, GS);
 
 	switch_fpu_finish(next_fpu, fpu_switch);
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de36660751b5..72b737b8c9d6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2738,6 +2738,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 		ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
 	}
 
+	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
 	return X86EMUL_CONTINUE;
 }
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5f2412704b81..d29c745f10ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3649,19 +3649,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 				unsigned level, unsigned gpte)
 {
 	/*
-	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
-	 */
-	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
-	/*
 	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
 	 * If it is clear, there are no large pages at this level, so clear
 	 * PT_PAGE_SIZE_MASK in gpte if that is the case.
 	 */
 	gpte &= level - mmu->last_nonleaf_level;
 
+	/*
+	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 */
+	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
 	return gpte & PT_PAGE_SIZE_MASK;
 }
 
@@ -4169,6 +4169,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
+	update_last_nonleaf_level(vcpu, context);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a01105485315..37363900297d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -324,10 +324,11 @@ retry_walk:
 		--walker->level;
 
 		index = PT_INDEX(addr, walker->level);
-
 		table_gfn = gpte_to_gfn(pte);
 		offset    = index * sizeof(pt_element_t);
 		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+
+		BUG_ON(walker->level < 1);
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3dc6d8017ce9..a8ae57acb6f6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2167,46 +2167,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	struct pi_desc old, new;
 	unsigned int dest;
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	/*
+	 * In case of hot-plug or hot-unplug, we may have to undo
+	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
+	 * always keep PI.NDST up to date for simplicity: it makes the
+	 * code easier, and CPU migration is not a fast path.
+	 */
+	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+		return;
+
+	/*
+	 * First handle the simple case where no cmpxchg is necessary; just
+	 * allow posting non-urgent interrupts.
+	 *
+	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+	 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+	 * expects the VCPU to be on the blocked_vcpu_list that matches
+	 * PI.NDST.
+	 */
+	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+	    vcpu->cpu == cpu) {
+		pi_clear_sn(pi_desc);
 		return;
+	}
 
+	/* The full case.  */
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
-		 * are two possible cases:
-		 * 1. After running 'pre_block', context switch
-		 *    happened. For this case, 'sn' was set in
-		 *    vmx_vcpu_put(), so we need to clear it here.
-		 * 2. After running 'pre_block', we were blocked,
-		 *    and woken up by some other guy. For this case,
-		 *    we don't need to do anything, 'pi_post_block'
-		 *    will do everything for us. However, we cannot
-		 *    check whether it is case #1 or case #2 here
-		 *    (maybe, not needed), so we also clear sn here,
-		 *    I think it is not a big deal.
-		 */
-		if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
-			if (vcpu->cpu != cpu) {
-				dest = cpu_physical_id(cpu);
-
-				if (x2apic_enabled())
-					new.ndst = dest;
-				else
-					new.ndst = (dest << 8) & 0xFF00;
-			}
+		dest = cpu_physical_id(cpu);
 
-			/* set 'NV' to 'notification vector' */
-			new.nv = POSTED_INTR_VECTOR;
-		}
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
 
-		/* Allow posting non-urgent interrupts */
 		new.sn = 0;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 }
 
 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -4761,21 +4759,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_SMP
 	if (vcpu->mode == IN_GUEST_MODE) {
-		struct vcpu_vmx *vmx = to_vmx(vcpu);
-
 		/*
-		 * Currently, we don't support urgent interrupt,
-		 * all interrupts are recognized as non-urgent
-		 * interrupt, so we cannot post interrupts when
-		 * 'SN' is set.
+		 * The vector of interrupt to be delivered to vcpu had
+		 * been set in PIR before this function.
 		 *
-		 * If the vcpu is in guest mode, it means it is
-		 * running instead of being scheduled out and
-		 * waiting in the run queue, and that's the only
-		 * case when 'SN' is set currently, warning if
-		 * 'SN' is set.
+		 * Following cases will be reached in this block, and
+		 * we always send a notification event in all cases as
+		 * explained below.
+		 *
+		 * Case 1: vcpu keeps in non-root mode. Sending a
+		 * notification event posts the interrupt to vcpu.
+		 *
+		 * Case 2: vcpu exits to root mode and is still
+		 * runnable. PIR will be synced to vIRR before the
+		 * next vcpu entry. Sending a notification event in
+		 * this case has no effect, as vcpu is not in root
+		 * mode.
+		 *
+		 * Case 3: vcpu exits to root mode and is blocked.
+		 * vcpu_block() has already synced PIR to vIRR and
+		 * never blocks vcpu if vIRR is not cleared. Therefore,
+		 * a blocked vcpu here does not wait for any requested
+		 * interrupts in PIR, and sending a notification event
+		 * which has no effect is safe here.
 		 */
-		WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
 
 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 				POSTED_INTR_VECTOR);
@@ -9187,6 +9194,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
+	/*
+	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+	 * or POSTED_INTR_WAKEUP_VECTOR.
+	 */
+	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+	vmx->pi_desc.sn = 1;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -9996,6 +10010,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 				page_to_phys(vmx->nested.virtual_apic_page));
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	} else {
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+				CPU_BASED_CR8_STORE_EXITING;
+#endif
 	}
 
 	if (cpu_has_vmx_msr_bitmap() &&
@@ -10671,7 +10690,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
 	 */
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+	vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
 	nested_ept_uninit_mmu_context(vcpu);
 
@@ -11000,6 +11019,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
+	unsigned int dest;
+
+	do {
+		old.control = new.control = pi_desc->control;
+		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+		     "Wakeup handler not enabled while the VCPU is blocked\n");
+
+		dest = cpu_physical_id(vcpu->cpu);
+
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
+
+		/* set 'NV' to 'notification vector' */
+		new.nv = POSTED_INTR_VECTOR;
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
+
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+		list_del(&vcpu->blocked_vcpu_list);
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+		vcpu->pre_pcpu = -1;
+	}
+}
+
 /*
  * This routine does the following things for vCPU which is going
  * to be blocked if VT-d PI is enabled.
@@ -11015,7 +11065,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  */
 static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
-	unsigned long flags;
 	unsigned int dest;
 	struct pi_desc old, new;
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11025,34 +11074,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 		!kvm_vcpu_apicv_active(vcpu))
 		return 0;
 
-	vcpu->pre_pcpu = vcpu->cpu;
-	spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-			  vcpu->pre_pcpu), flags);
-	list_add_tail(&vcpu->blocked_vcpu_list,
-		      &per_cpu(blocked_vcpu_on_cpu,
-		      vcpu->pre_pcpu));
-	spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
-			       vcpu->pre_pcpu), flags);
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+		vcpu->pre_pcpu = vcpu->cpu;
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+		list_add_tail(&vcpu->blocked_vcpu_list,
+			      &per_cpu(blocked_vcpu_on_cpu,
+				       vcpu->pre_pcpu));
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+	}
 
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * We should not block the vCPU if
-		 * an interrupt is posted for it.
-		 */
-		if (pi_test_on(pi_desc) == 1) {
-			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-					  vcpu->pre_pcpu), flags);
-			list_del(&vcpu->blocked_vcpu_list);
-			spin_unlock_irqrestore(
-					&per_cpu(blocked_vcpu_on_cpu_lock,
-					vcpu->pre_pcpu), flags);
-			vcpu->pre_pcpu = -1;
-
-			return 1;
-		}
-
 		WARN((pi_desc->sn == 1),
 		     "Warning: SN field of posted-interrupts "
 		     "is set before blocking\n");
@@ -11074,10 +11109,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 
 		/* set 'NV' to 'wakeup vector' */
 		new.nv = POSTED_INTR_WAKEUP_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 
-	return 0;
+	/* We should not block the vCPU if an interrupt is posted for it.  */
+	if (pi_test_on(pi_desc) == 1)
+		__pi_post_block(vcpu);
+
+	local_irq_enable();
+	return (vcpu->pre_pcpu == -1);
 }
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11093,44 +11133,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-	unsigned long flags;
-
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	if (vcpu->pre_pcpu == -1)
 		return;
 
-	do {
-		old.control = new.control = pi_desc->control;
-
-		dest = cpu_physical_id(vcpu->cpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		/* Allow posting non-urgent interrupts */
-		new.sn = 0;
-
-		/* set 'NV' to 'notification vector' */
-		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
-
-	if(vcpu->pre_pcpu != -1) {
-		spin_lock_irqsave(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock_irqrestore(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		vcpu->pre_pcpu = -1;
-	}
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
+	__pi_post_block(vcpu);
+	local_irq_enable();
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
@@ -11158,7 +11167,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
-	int idx, ret = -EINVAL;
+	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) ||
 		!irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11167,7 +11176,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-	BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+	if (guest_irq >= irq_rt->nr_rt_entries ||
+	    hlist_empty(&irq_rt->map[guest_irq])) {
+		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+			     guest_irq, irq_rt->nr_rt_entries);
+		goto out;
+	}
 
 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 		if (e->type != KVM_IRQ_ROUTING_MSI)
@@ -11210,12 +11224,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
 		if (set)
 			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		else {
-			/* suppress notification event before unposting */
-			pi_set_sn(vcpu_to_pi_desc(vcpu));
+		else
 			ret = irq_set_vcpu_affinity(host_irq, NULL);
-			pi_clear_sn(vcpu_to_pi_desc(vcpu));
-		}
 
 		if (ret < 0) {
 			printk(KERN_INFO "%s: failed to update PI IRTE\n",
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9e2c2b00215b..31b15149f412 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5250,6 +5250,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
 	ctxt->eflags = kvm_get_rflags(vcpu);
+	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
 	ctxt->eip = kvm_rip_read(vcpu);
 	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
 		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
@@ -5465,37 +5467,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
 	return dr6;
 }
 
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
 {
 	struct kvm_run *kvm_run = vcpu->run;
 
-	/*
-	 * rflags is the old, "raw" value of the flags.  The new value has
-	 * not been saved yet.
-	 *
-	 * This is correct even for TF set by the guest, because "the
-	 * processor will not generate this exception after the instruction
-	 * that sets the TF flag".
-	 */
-	if (unlikely(rflags & X86_EFLAGS_TF)) {
-		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-			kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
-						  DR6_RTM;
-			kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
-			kvm_run->debug.arch.exception = DB_VECTOR;
-			kvm_run->exit_reason = KVM_EXIT_DEBUG;
-			*r = EMULATE_USER_EXIT;
-		} else {
-			vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
-			/*
-			 * "Certain debug exceptions may clear bit 0-3.  The
-			 * remaining contents of the DR6 register are never
-			 * cleared by the processor".
-			 */
-			vcpu->arch.dr6 &= ~15;
-			vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
-			kvm_queue_exception(vcpu, DB_VECTOR);
-		}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+		kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+		kvm_run->debug.arch.exception = DB_VECTOR;
+		kvm_run->exit_reason = KVM_EXIT_DEBUG;
+		*r = EMULATE_USER_EXIT;
+	} else {
+		vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+		/*
+		 * "Certain debug exceptions may clear bit 0-3.  The
+		 * remaining contents of the DR6 register are never
+		 * cleared by the processor".
+		 */
+		vcpu->arch.dr6 &= ~15;
+		vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+		kvm_queue_exception(vcpu, DB_VECTOR);
 	}
 }
 
@@ -5650,8 +5641,9 @@ restart:
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 		kvm_rip_write(vcpu, ctxt->eip);
-		if (r == EMULATE_DONE)
-			kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+		if (r == EMULATE_DONE &&
+		    (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+			kvm_vcpu_do_singlestep(vcpu, &r);
 		if (!ctxt->have_exception ||
 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP)
 			__kvm_set_rflags(vcpu, ctxt->eflags);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9f72ca3b2669..1dd796025472 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -191,8 +191,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *	     faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-		struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 {
 	/* This is effectively an #ifdef */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -208,7 +207,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
-	if (!vma) {
+	if (!pkey) {
 		WARN_ONCE(1, "PKU fault with no VMA passed in");
 		info->si_pkey = 0;
 		return;
@@ -218,13 +217,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * absolutely guranteed to be 100% accurate because of
 	 * the race explained above.
 	 */
-	info->si_pkey = vma_pkey(vma);
+	info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk, struct vm_area_struct *vma,
-		     int fault)
+		     struct task_struct *tsk, u32 *pkey, int fault)
 {
 	unsigned lsb = 0;
 	siginfo_t info;
@@ -239,7 +237,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 		lsb = PAGE_SHIFT;
 	info.si_addr_lsb = lsb;
 
-	fill_sig_info_pkey(si_code, &info, vma);
+	fill_sig_info_pkey(si_code, &info, pkey);
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -718,8 +716,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 	unsigned long flags;
 	int sig;
-	/* No context means no VMA to pass down */
-	struct vm_area_struct *vma = NULL;
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -744,7 +740,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_info_fault(signal, si_code, address,
-					     tsk, vma, 0);
+					     tsk, NULL, 0);
 		}
 
 		/*
@@ -853,8 +849,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		       unsigned long address, struct vm_area_struct *vma,
-		       int si_code)
+		       unsigned long address, u32 *pkey, int si_code)
 {
 	struct task_struct *tsk = current;
 
@@ -902,7 +897,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code	= error_code;
 		tsk->thread.trap_nr	= X86_TRAP_PF;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
 		return;
 	}
@@ -915,9 +910,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		     unsigned long address, struct vm_area_struct *vma)
+		     unsigned long address, u32 *pkey)
 {
-	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 }
 
 static void
@@ -925,6 +920,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	   unsigned long address,  struct vm_area_struct *vma, int si_code)
 {
 	struct mm_struct *mm = current->mm;
+	u32 pkey;
+
+	if (vma)
+		pkey = vma_pkey(vma);
 
 	/*
 	 * Something tried to access memory that isn't in our memory map..
@@ -932,7 +931,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	 */
 	up_read(&mm->mmap_sem);
 
-	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
+	__bad_area_nosemaphore(regs, error_code, address,
+			       (vma) ? &pkey : NULL, si_code);
 }
 
 static noinline void
@@ -975,7 +975,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
-	  struct vm_area_struct *vma, unsigned int fault)
+	  u32 *pkey, unsigned int fault)
 {
 	struct task_struct *tsk = current;
 	int code = BUS_ADRERR;
@@ -1002,13 +1002,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
+	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
 }
 
 static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-	       unsigned long address, struct vm_area_struct *vma,
-	       unsigned int fault)
+	       unsigned long address, u32 *pkey, unsigned int fault)
 {
 	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
@@ -1032,9 +1031,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 			     VM_FAULT_HWPOISON_LARGE))
-			do_sigbus(regs, error_code, address, vma, fault);
+			do_sigbus(regs, error_code, address, pkey, fault);
 		else if (fault & VM_FAULT_SIGSEGV)
-			bad_area_nosemaphore(regs, error_code, address, vma);
+			bad_area_nosemaphore(regs, error_code, address, pkey);
 		else
 			BUG();
 	}
@@ -1220,6 +1219,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	int fault, major = 0;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1420,9 +1420,10 @@ good_area:
 		return;
 	}
 
+	pkey = vma_pkey(vma);
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, vma, fault);
+		mm_fault_error(regs, error_code, address, &pkey, fault);
 		return;
 	}
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a324fc8bed8..3e27ded6ac65 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -689,7 +689,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 	if (PageReserved(page)) {
 		__ClearPageReserved(page);
 
-		magic = (unsigned long)page->lru.next;
+		magic = (unsigned long)page->freelist;
 		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 555b9fa0ad43..7dbdb780264d 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
 LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
 targets += purgatory.ro
 
+KASAN_SANITIZE	:= n
 KCOV_INSTRUMENT := n
 
 # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That