From dfe94d4086e40e92b1926bddcefa629b791e9b28 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Mon, 21 Dec 2020 22:55:41 -0800
Subject: x86/hyperv: Fix kexec panic/hang issues

Currently the kexec kernel can panic or hang due to 2 causes:

1) hv_cpu_die() is not called upon kexec, so the hypervisor corrupts the
old VP Assist Pages when the kexec kernel runs. The same issue is fixed
for hibernation in commit 421f090c819d ("x86/hyperv: Suspend/resume the
VP assist page for hibernation"). Now fix it for kexec.

2) hyperv_cleanup() is called too early. In the kexec path, the other CPUs
are stopped in hv_machine_shutdown() -> native_machine_shutdown(), so
between hv_kexec_handler() and native_machine_shutdown(), the other CPUs
can still try to access the hypercall page and cause panic. The workaround
"hv_hypercall_pg = NULL;" in hyperv_cleanup() is unreliabe. Move
hyperv_cleanup() to a better place.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20201222065541.24312-1-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/kernel/cpu/mshyperv.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f628e3dc150f..43b54bef5448 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -135,14 +135,32 @@ static void hv_machine_shutdown(void)
 {
 	if (kexec_in_progress && hv_kexec_handler)
 		hv_kexec_handler();
+
+	/*
+	 * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
+	 * corrupts the old VP Assist Pages and can crash the kexec kernel.
+	 */
+	if (kexec_in_progress && hyperv_init_cpuhp > 0)
+		cpuhp_remove_state(hyperv_init_cpuhp);
+
+	/* The function calls stop_other_cpus(). */
 	native_machine_shutdown();
+
+	/* Disable the hypercall page when there is only 1 active CPU. */
+	if (kexec_in_progress)
+		hyperv_cleanup();
 }
 
 static void hv_machine_crash_shutdown(struct pt_regs *regs)
 {
 	if (hv_crash_handler)
 		hv_crash_handler(regs);
+
+	/* The function calls crash_smp_send_stop(). */
 	native_machine_crash_shutdown(regs);
+
+	/* Disable the hypercall page when there is only 1 active CPU. */
+	hyperv_cleanup();
 }
 #endif /* CONFIG_KEXEC_CORE */
 #endif /* CONFIG_HYPERV */
-- 
cgit v1.2.3


From 7024f60d655272bd2ca1d3a4c9e0a63319b1eea1 Mon Sep 17 00:00:00 2001
From: "Hyunwook (Wooky) Baek" <baekhw@google.com>
Date: Sat, 9 Jan 2021 23:11:02 -0800
Subject: x86/sev-es: Handle string port IO to kernel memory properly

Don't assume dest/source buffers are userspace addresses when manually
copying data for string I/O or MOVS MMIO, as {get,put}_user() will fail
if handed a kernel address and ultimately lead to a kernel panic.

When invoking INSB/OUTSB instructions in kernel space in a
SEV-ES-enabled VM, the kernel crashes with the following message:

  "SEV-ES: Unsupported exception in #VC instruction emulation - can't continue"

Handle that case properly.

 [ bp: Massage commit message. ]

Fixes: f980f9c31a92 ("x86/sev-es: Compile early handler code into kernel image")
Signed-off-by: Hyunwook (Wooky) Baek <baekhw@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Link: https://lkml.kernel.org/r/20210110071102.2576186-1-baekhw@google.com
---
 arch/x86/kernel/sev-es.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 0bd1a0fc587e..ab31c34ba508 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -286,6 +286,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
 	u16 d2;
 	u8  d1;
 
+	/* If instruction ran in kernel mode and the I/O buffer is in kernel space */
+	if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
+		memcpy(dst, buf, size);
+		return ES_OK;
+	}
+
 	switch (size) {
 	case 1:
 		memcpy(&d1, buf, 1);
@@ -335,6 +341,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
 	u16 d2;
 	u8  d1;
 
+	/* If instruction ran in kernel mode and the I/O buffer is in kernel space */
+	if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
+		memcpy(buf, src, size);
+		return ES_OK;
+	}
+
 	switch (size) {
 	case 1:
 		if (get_user(d1, s))
-- 
cgit v1.2.3


From 76e2fc63ca40977af893b724b00cc2f8e9ce47a4 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Mon, 11 Jan 2021 11:04:29 +0100
Subject: x86/cpu/amd: Set __max_die_per_package on AMD

Set the maximum DIE per package variable on AMD using the
NodesPerProcessor topology value. This will be used by RAPL, among
others, to determine the maximum number of DIEs on the system in order
to do per-DIE manipulations.

 [ bp: Productize into a proper patch. ]

Fixes: 028c221ed190 ("x86/CPU/AMD: Save AMD NodeId as cpu_die_id")
Reported-by: Johnathan Smithinovic <johnathan.smithinovic@gmx.at>
Reported-by: Rafael Kitover <rkitover@gmail.com>
Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Johnathan Smithinovic <johnathan.smithinovic@gmx.at>
Tested-by: Rafael Kitover <rkitover@gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=210939
Link: https://lkml.kernel.org/r/20210106112106.GE5729@zn.tnic
Link: https://lkml.kernel.org/r/20210111101455.1194-1-bp@alien8.de
---
 arch/x86/kernel/cpu/amd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f8ca66f3d861..347a956f71ca 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -542,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		u32 ecx;
 
 		ecx = cpuid_ecx(0x8000001e);
-		nodes_per_socket = ((ecx >> 8) & 7) + 1;
+		__max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
 	} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		nodes_per_socket = ((value >> 3) & 7) + 1;
+		__max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
 	}
 
 	if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
-- 
cgit v1.2.3


From a1d5c98aac33a5a0004ecf88905dcc261c52f988 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 6 Jan 2021 15:36:21 +0100
Subject: x86/sev: Fix nonistr violation

When the compiler fails to inline, it violates nonisntr:

  vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0xc7: call to sev_es_wr_ghcb_msr() leaves .noinstr.text section

Fixes: 4ca68e023b11 ("x86/sev-es: Handle NMI State")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210106144017.532902065@infradead.org
---
 arch/x86/kernel/sev-es.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index ab31c34ba508..84c1821819af 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -225,7 +225,7 @@ static inline u64 sev_es_rd_ghcb_msr(void)
 	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
 }
 
-static inline void sev_es_wr_ghcb_msr(u64 val)
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
 {
 	u32 low, high;
 
-- 
cgit v1.2.3


From 737495361d4469477ffe45d51e6fc56f44f3cc6a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 6 Jan 2021 15:36:24 +0100
Subject: x86/mce: Remove explicit/superfluous tracing

There's some explicit tracing left in exc_machine_check_kernel(),
remove it, as it's already implied by irqentry_nmi_enter().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210106144017.719310466@infradead.org
---
 arch/x86/kernel/cpu/mce/core.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 13d3f1cbda17..e133ce1e562b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1992,10 +1992,9 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
 	 * that out because it's an indirect call. Annotate it.
 	 */
 	instrumentation_begin();
-	trace_hardirqs_off_finish();
+
 	machine_check_vector(regs);
-	if (regs->flags & X86_EFLAGS_IF)
-		trace_hardirqs_on_prepare();
+
 	instrumentation_end();
 	irqentry_nmi_exit(regs, irq_state);
 }
@@ -2004,7 +2003,9 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs)
 {
 	irqentry_enter_from_user_mode(regs);
 	instrumentation_begin();
+
 	machine_check_vector(regs);
+
 	instrumentation_end();
 	irqentry_exit_to_user_mode(regs);
 }
-- 
cgit v1.2.3


From 1eb8f690bcb565a6600f8b6dcc78f7b239ceba17 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 14 Jan 2021 10:36:59 +0100
Subject: x86/topology: Make __max_die_per_package available unconditionally

Move it outside of CONFIG_SMP in order to avoid ifdeffery at the usage
sites.

Fixes: 76e2fc63ca40 ("x86/cpu/amd: Set __max_die_per_package on AMD")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210114111814.5346-1-bp@alien8.de
---
 arch/x86/kernel/cpu/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 1068002c8532..8678864ce712 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -25,10 +25,10 @@
 #define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
 #define LEVEL_MAX_SIBLINGS(ebx)		((ebx) & 0xffff)
 
-#ifdef CONFIG_SMP
 unsigned int __max_die_per_package __read_mostly = 1;
 EXPORT_SYMBOL(__max_die_per_package);
 
+#ifdef CONFIG_SMP
 /*
  * Check if given CPUID extended toplogy "leaf" is implemented
  */
-- 
cgit v1.2.3


From 9c7d9017a49fb8516c13b7bff59b7da2abed23e1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 8 Jan 2021 19:05:59 +0100
Subject: x86: PM: Register syscore_ops for scale invariance

On x86 scale invariace tends to be disabled during resume from
suspend-to-RAM, because the MPERF or APERF MSR values are not as
expected then due to updates taking place after the platform
firmware has been invoked to complete the suspend transition.

That, of course, is not desirable, especially if the schedutil
scaling governor is in use, because the lack of scale invariance
causes it to be less reliable.

To counter that effect, modify init_freq_invariance() to register
a syscore_ops object for scale invariance with the ->resume callback
pointing to init_counter_refs() which will run on the CPU starting
the resume transition (the other CPUs will be taken care of the
"online" operations taking place later).

Fixes: e2b0d619b400 ("x86, sched: check for counters overflow in frequency invariant accounting")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Link: https://lkml.kernel.org/r/1803209.Mvru99baaF@kreacher
---
 arch/x86/kernel/smpboot.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ca66af96a54..117e24fbfd8a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include <linux/numa.h>
 #include <linux/pgtable.h>
 #include <linux/overflow.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
 	this_cpu_write(arch_prev_mperf, mperf);
 }
 
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+	.resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
+{
+	/* Bail out if registered already. */
+	if (freq_invariance_syscore_ops.node.prev)
+		return;
+
+	register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
+
 static void init_freq_invariance(bool secondary, bool cppc_ready)
 {
 	bool ret = false;
@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
 	if (ret) {
 		init_counter_refs();
 		static_branch_enable(&arch_scale_freq_key);
+		register_freq_invariance_syscore_ops();
 		pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
 	} else {
 		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
-- 
cgit v1.2.3


From e45122893a9870813f9bd7b4add4f613e6f29008 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 20 Jan 2021 21:09:48 -0800
Subject: x86/fpu: Add kernel_fpu_begin_mask() to selectively initialize state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, requesting kernel FPU access doesn't distinguish which parts of
the extended ("FPU") state are needed.  This is nice for simplicity, but
there are a few cases in which it's suboptimal:

 - The vast majority of in-kernel FPU users want XMM/YMM/ZMM state but do
   not use legacy 387 state.  These users want MXCSR initialized but don't
   care about the FPU control word.  Skipping FNINIT would save time.
   (Empirically, FNINIT is several times slower than LDMXCSR.)

 - Code that wants MMX doesn't want or need MXCSR initialized.
   _mmx_memcpy(), for example, can run before CR4.OSFXSR gets set, and
   initializing MXCSR will fail because LDMXCSR generates an #UD when the
   aforementioned CR4 bit is not set.

 - Any future in-kernel users of XFD (eXtended Feature Disable)-capable
   dynamic states will need special handling.

Add a more specific API that allows callers to specify exactly what they
want.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Krzysztof Piotr Olędzki <ole@ans.pl>
Link: https://lkml.kernel.org/r/aff1cac8b8fc7ee900cf73e8f2369966621b053f.1611205691.git.luto@kernel.org
---
 arch/x86/kernel/fpu/core.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eb86a2b831b1..571220ac8bea 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
 }
 EXPORT_SYMBOL(copy_fpregs_to_fpstate);
 
-void kernel_fpu_begin(void)
+void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
 	preempt_disable();
 
@@ -141,13 +141,14 @@ void kernel_fpu_begin(void)
 	}
 	__cpu_invalidate_fpregs_state();
 
-	if (boot_cpu_has(X86_FEATURE_XMM))
+	/* Put sane initial values into the control registers. */
+	if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
 		ldmxcsr(MXCSR_DEFAULT);
 
-	if (boot_cpu_has(X86_FEATURE_FPU))
+	if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
 		asm volatile ("fninit");
 }
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
 
 void kernel_fpu_end(void)
 {
-- 
cgit v1.2.3


From bde9cfa3afe4324ec251e4af80ebf9b7afaf7afe Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Sat, 23 Jan 2021 21:00:57 -0800
Subject: x86/setup: don't remove E820_TYPE_RAM for pfn 0

Patch series "mm: fix initialization of struct page for holes in  memory layout", v3.

Commit 73a6e474cb37 ("mm: memmap_init: iterate over memblock regions
rather that check each PFN") exposed several issues with the memory map
initialization and these patches fix those issues.

Initially there were crashes during compaction that Qian Cai reported
back in April [1].  It seemed back then that the problem was fixed, but
a few weeks ago Andrea Arcangeli hit the same bug [2] and there was an
additional discussion at [3].

[1] https://lore.kernel.org/lkml/8C537EB7-85EE-4DCF-943E-3CC0ED0DF56D@lca.pw
[2] https://lore.kernel.org/lkml/20201121194506.13464-1-aarcange@redhat.com
[3] https://lore.kernel.org/mm-commits/20201206005401.qKuAVgOXr%akpm@linux-foundation.org

This patch (of 2):

The first 4Kb of memory is a BIOS owned area and to avoid its allocation
for the kernel it was not listed in e820 tables as memory.  As the result,
pfn 0 was never recognised by the generic memory management and it is not
a part of neither node 0 nor ZONE_DMA.

If set_pfnblock_flags_mask() would be ever called for the pageblock
corresponding to the first 2Mbytes of memory, having pfn 0 outside of
ZONE_DMA would trigger

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

Along with reserving the first 4Kb in e820 tables, several first pages are
reserved with memblock in several places during setup_arch().  These
reservations are enough to ensure the kernel does not touch the BIOS area
and it is not necessary to remove E820_TYPE_RAM for pfn 0.

Remove the update of e820 table that changes the type of pfn 0 and move
the comment describing why it was done to trim_low_memory_range() that
reserves the beginning of the memory.

Link: https://lkml.kernel.org/r/20210111194017.22696-2-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 740f3bdb3f61..3412c4595efd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -660,17 +660,6 @@ static void __init trim_platform_memory_ranges(void)
 
 static void __init trim_bios_range(void)
 {
-	/*
-	 * A special case is the first 4Kb of memory;
-	 * This is a BIOS owned area, not kernel ram, but generally
-	 * not listed as such in the E820 table.
-	 *
-	 * This typically reserves additional memory (64KiB by default)
-	 * since some BIOSes are known to corrupt low memory.  See the
-	 * Kconfig help text for X86_RESERVE_LOW.
-	 */
-	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
-
 	/*
 	 * special case: Some BIOSes report the PC BIOS
 	 * area (640Kb -> 1Mb) as RAM even though it is not.
@@ -728,6 +717,15 @@ early_param("reservelow", parse_reservelow);
 
 static void __init trim_low_memory_range(void)
 {
+	/*
+	 * A special case is the first 4Kb of memory;
+	 * This is a BIOS owned area, not kernel ram, but generally
+	 * not listed as such in the E820 table.
+	 *
+	 * This typically reserves additional memory (64KiB by default)
+	 * since some BIOSes are known to corrupt low memory.  See the
+	 * Kconfig help text for X86_RESERVE_LOW.
+	 */
 	memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
 }
 	
-- 
cgit v1.2.3


From 9ad22e165994ccb64d85b68499eaef97342c175b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 28 Jan 2021 22:16:27 +0100
Subject: x86/debug: Fix DR6 handling

Tom reported that one of the GDB test-cases failed, and Boris bisected
it to commit:

  d53d9bc0cf78 ("x86/debug: Change thread.debugreg6 to thread.virtual_dr6")

The debugging session led us to commit:

  6c0aca288e72 ("x86: Ignore trap bits on single step exceptions")

It turns out that TF and data breakpoints are both traps and will be
merged, while instruction breakpoints are faults and will not be merged.
This means 6c0aca288e72 is wrong, only TF and instruction breakpoints
need to be excluded while TF and data breakpoints can be merged.

 [ bp: Massage commit message. ]

Fixes: d53d9bc0cf78 ("x86/debug: Change thread.debugreg6 to thread.virtual_dr6")
Fixes: 6c0aca288e72 ("x86: Ignore trap bits on single step exceptions")
Reported-by: Tom de Vries <tdevries@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/YBMAbQGACujjfz%2Bi@hirez.programming.kicks-ass.net
Link: https://lkml.kernel.org/r/20210128211627.GB4348@worktop.programming.kicks-ass.net
---
 arch/x86/kernel/hw_breakpoint.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 03aa33b58165..6694c0f8e6c1 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -491,15 +491,12 @@ static int hw_breakpoint_handler(struct die_args *args)
 	struct perf_event *bp;
 	unsigned long *dr6_p;
 	unsigned long dr6;
+	bool bpx;
 
 	/* The DR6 value is pointed by args->err */
 	dr6_p = (unsigned long *)ERR_PTR(args->err);
 	dr6 = *dr6_p;
 
-	/* If it's a single step, TRAP bits are random */
-	if (dr6 & DR_STEP)
-		return NOTIFY_DONE;
-
 	/* Do an early return if no trap bits are set in DR6 */
 	if ((dr6 & DR_TRAP_BITS) == 0)
 		return NOTIFY_DONE;
@@ -509,28 +506,29 @@ static int hw_breakpoint_handler(struct die_args *args)
 		if (likely(!(dr6 & (DR_TRAP0 << i))))
 			continue;
 
+		bp = this_cpu_read(bp_per_reg[i]);
+		if (!bp)
+			continue;
+
+		bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;
+
 		/*
-		 * The counter may be concurrently released but that can only
-		 * occur from a call_rcu() path. We can then safely fetch
-		 * the breakpoint, use its callback, touch its counter
-		 * while we are in an rcu_read_lock() path.
+		 * TF and data breakpoints are traps and can be merged, however
+		 * instruction breakpoints are faults and will be raised
+		 * separately.
+		 *
+		 * However DR6 can indicate both TF and instruction
+		 * breakpoints. In that case take TF as that has precedence and
+		 * delay the instruction breakpoint for the next exception.
 		 */
-		rcu_read_lock();
+		if (bpx && (dr6 & DR_STEP))
+			continue;
 
-		bp = this_cpu_read(bp_per_reg[i]);
 		/*
 		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
 		 * exception handling
 		 */
 		(*dr6_p) &= ~(DR_TRAP0 << i);
-		/*
-		 * bp can be NULL due to lazy debug register switching
-		 * or due to concurrent perf counter removing.
-		 */
-		if (!bp) {
-			rcu_read_unlock();
-			break;
-		}
 
 		perf_bp_event(bp, args->regs);
 
@@ -538,11 +536,10 @@ static int hw_breakpoint_handler(struct die_args *args)
 		 * Set up resume flag to avoid breakpoint recursion when
 		 * returning back to origin.
 		 */
-		if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+		if (bpx)
 			args->regs->flags |= X86_EFLAGS_RF;
-
-		rcu_read_unlock();
 	}
+
 	/*
 	 * Further processing in do_debug() is needed for a) user-space
 	 * breakpoints (to generate signals) and b) when the system has
-- 
cgit v1.2.3


From 8acf417805a5f5c69e9ff66f14cab022c2755161 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 1 Feb 2021 19:00:07 +0000
Subject: x86/split_lock: Enable the split lock feature on another Alder Lake
 CPU

Add Alder Lake mobile processor to CPU list to enumerate and enable the
split lock feature.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20210201190007.4031869-1-fenghua.yu@intel.com
---
 arch/x86/kernel/cpu/intel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 59a1e3ce3f14..816fdbec795a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1159,6 +1159,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		1),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	1),
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		1),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		1),
 	{}
 };
 
-- 
cgit v1.2.3


From 5c279c4cf206e03995e04fd3404fa95ffd243a97 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Thu, 4 Feb 2021 20:12:37 +0200
Subject: Revert "x86/setup: don't remove E820_TYPE_RAM for pfn 0"

This reverts commit bde9cfa3afe4324ec251e4af80ebf9b7afaf7afe.

Changing the first memory page type from E820_TYPE_RESERVED to
E820_TYPE_RAM makes it a part of "System RAM" resource rather than a
reserved resource and this in turn causes devmem_is_allowed() to treat
is as area that can be accessed but it is filled with zeroes instead of
the actual data as previously.

The change in /dev/mem output causes lilo to fail as was reported at
slakware users forum, and probably other legacy applications will
experience similar problems.

Link: https://www.linuxquestions.org/questions/slackware-14/slackware-current-lilo-vesa-warnings-after-recent-updates-4175689617/#post6214439
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3412c4595efd..740f3bdb3f61 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -660,6 +660,17 @@ static void __init trim_platform_memory_ranges(void)
 
 static void __init trim_bios_range(void)
 {
+	/*
+	 * A special case is the first 4Kb of memory;
+	 * This is a BIOS owned area, not kernel ram, but generally
+	 * not listed as such in the E820 table.
+	 *
+	 * This typically reserves additional memory (64KiB by default)
+	 * since some BIOSes are known to corrupt low memory.  See the
+	 * Kconfig help text for X86_RESERVE_LOW.
+	 */
+	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
 	/*
 	 * special case: Some BIOSes report the PC BIOS
 	 * area (640Kb -> 1Mb) as RAM even though it is not.
@@ -717,15 +728,6 @@ early_param("reservelow", parse_reservelow);
 
 static void __init trim_low_memory_range(void)
 {
-	/*
-	 * A special case is the first 4Kb of memory;
-	 * This is a BIOS owned area, not kernel ram, but generally
-	 * not listed as such in the E820 table.
-	 *
-	 * This typically reserves additional memory (64KiB by default)
-	 * since some BIOSes are known to corrupt low memory.  See the
-	 * Kconfig help text for X86_RESERVE_LOW.
-	 */
 	memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
 }
 	
-- 
cgit v1.2.3


From 25a068b8e9a4eb193d755d58efcb3c98928636e0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 5 Mar 2020 09:47:08 -0800
Subject: x86/apic: Add extra serialization for non-serializing MSRs

Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a plain
MFENCE while the Intel SDM (10.12.3 MSR Access in x2APIC Mode) calls for
MFENCE; LFENCE.

Short summary: we have special MSRs that have weaker ordering than all
the rest. Add fencing consistent with current SDM recommendations.

This is not known to cause any issues in practice, only in theory.

Longer story below:

The reason the kernel uses a different semantic is that the SDM changed
(roughly in late 2017). The SDM changed because folks at Intel were
auditing all of the recommended fences in the SDM and realized that the
x2apic fences were insufficient.

Why was the pain MFENCE judged insufficient?

WRMSR itself is normally a serializing instruction. No fences are needed
because the instruction itself serializes everything.

But, there are explicit exceptions for this serializing behavior written
into the WRMSR instruction documentation for two classes of MSRs:
IA32_TSC_DEADLINE and the X2APIC MSRs.

Back to x2apic: WRMSR is *not* serializing in this specific case.
But why is MFENCE insufficient? MFENCE makes writes visible, but
only affects load/store instructions. WRMSR is unfortunately not a
load/store instruction and is unaffected by MFENCE. This means that a
non-serializing WRMSR could be reordered by the CPU to execute before
the writes made visible by the MFENCE have even occurred in the first
place.

This means that an x2apic IPI could theoretically be triggered before
there is any (visible) data to process.

Does this affect anything in practice? I honestly don't know. It seems
quite possible that by the time an interrupt gets to consume the (not
yet) MFENCE'd data, it has become visible, mostly by accident.

To be safe, add the SDM-recommended fences for all x2apic WRMSRs.

This also leaves open the question of the _other_ weakly-ordered WRMSR:
MSR_IA32_TSC_DEADLINE. While it has the same ordering architecture as
the x2APIC MSRs, it seems substantially less likely to be a problem in
practice. While writes to the in-memory Local Vector Table (LVT) might
theoretically be reordered with respect to a weakly-ordered WRMSR like
TSC_DEADLINE, the SDM has this to say:

  In x2APIC mode, the WRMSR instruction is used to write to the LVT
  entry. The processor ensures the ordering of this write and any
  subsequent WRMSR to the deadline; no fencing is required.

But, that might still leave xAPIC exposed. The safest thing to do for
now is to add the extra, recommended LFENCE.

 [ bp: Massage commit message, fix typos, drop accidentally added
   newline to tools/arch/x86/include/asm/barrier.h. ]

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200305174708.F77040DD@viggo.jf.intel.com
---
 arch/x86/kernel/apic/apic.c           | 4 ++++
 arch/x86/kernel/apic/x2apic_cluster.c | 6 ++++--
 arch/x86/kernel/apic/x2apic_phys.c    | 9 ++++++---
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6bd20c0de8bc..7f4c081f59f0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -41,6 +41,7 @@
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <linux/atomic.h>
+#include <asm/barrier.h>
 #include <asm/mpspec.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
@@ -477,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
+	/* This MSR is special and need a special fence: */
+	weak_wrmsr_fence();
+
 	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index df6adc5674c9..f4da9bb69a88 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector)
 {
 	u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
 }
 
@@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 	unsigned long flags;
 	u32 dest;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	local_irq_save(flags);
 
 	tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 0e4e81971567..6bde05a86b4e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -43,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector)
 {
 	u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
 }
 
@@ -54,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 	unsigned long this_cpu;
 	unsigned long flags;
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 
 	local_irq_save(flags);
 
@@ -125,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which)
 {
 	unsigned long cfg = __prepare_ICR(which, vector, 0);
 
-	x2apic_wrmsr_fence();
+	/* x2apic MSRs are special and need a special fence: */
+	weak_wrmsr_fence();
 	native_x2apic_icr_write(cfg, 0);
 }
 
-- 
cgit v1.2.3


From c4bed4b96918ff1d062ee81fdae4d207da4fa9b0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 4 Feb 2021 23:27:06 +0800
Subject: x86/debug: Prevent data breakpoints on __per_cpu_offset

When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU GSBASE value
via __per_cpu_offset or pcpu_unit_offsets.

When a data breakpoint is set on __per_cpu_offset[cpu] (read-write
operation), the specific CPU will be stuck in an infinite #DB loop.

RCU will try to send an NMI to the specific CPU, but it is not working
either since NMI also relies on paranoid_entry(). Which means it's
undebuggable.

Fixes: eaad981291ee3("x86/entry/64: Introduce the FIND_PERCPU_BASE macro")
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-1-jiangshanlai@gmail.com
---
 arch/x86/kernel/hw_breakpoint.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 6694c0f8e6c1..012ed82e5bd3 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
 			CPU_ENTRY_AREA_TOTAL_SIZE))
 		return true;
 
+	/*
+	 * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+	 * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+	 */
+#ifdef CONFIG_SMP
+	if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+			sizeof(unsigned long) * nr_cpu_ids))
+		return true;
+#else
+	if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+			sizeof(pcpu_unit_offsets)))
+		return true;
+#endif
+
 	for_each_possible_cpu(cpu) {
 		/* The original rw GDT is being used after load_direct_gdt() */
 		if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
-- 
cgit v1.2.3


From 3943abf2dbfae9ea4d2da05c1db569a0603f76da Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 4 Feb 2021 23:27:07 +0800
Subject: x86/debug: Prevent data breakpoints on cpu_dr7

local_db_save() is called at the start of exc_debug_kernel(), reads DR7 and
disables breakpoints to prevent recursion.

When running in a guest (X86_FEATURE_HYPERVISOR), local_db_save() reads the
per-cpu variable cpu_dr7 to check whether a breakpoint is active or not
before it accesses DR7.

A data breakpoint on cpu_dr7 therefore results in infinite #DB recursion.

Disallow data breakpoints on cpu_dr7 to prevent that.

Fixes: 84b6a3491567a("x86/entry: Optimize local_db_save() for virt")
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-2-jiangshanlai@gmail.com
---
 arch/x86/kernel/hw_breakpoint.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 012ed82e5bd3..668a4a6533d9 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -307,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
 				(unsigned long)&per_cpu(cpu_tlbstate, cpu),
 				sizeof(struct tlb_state)))
 			return true;
+
+		/*
+		 * When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+		 * will read per-cpu cpu_dr7 before clear dr7 register.
+		 */
+		if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+				sizeof(cpu_dr7)))
+			return true;
 	}
 
 	return false;
-- 
cgit v1.2.3


From 6342adcaa683c2b705c24ed201dc11b35854c88d Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Wed, 3 Feb 2021 13:00:48 -0500
Subject: entry: Ensure trap after single-step on system call return

Commit 299155244770 ("entry: Drop usage of TIF flags in the generic syscall
code") introduced a bug on architectures using the generic syscall entry
code, in which processes stopped by PTRACE_SYSCALL do not trap on syscall
return after receiving a TIF_SINGLESTEP.

The reason is that the meaning of TIF_SINGLESTEP flag is overloaded to
cause the trap after a system call is executed, but since the above commit,
the syscall call handler only checks for the SYSCALL_WORK flags on the exit
work.

Split the meaning of TIF_SINGLESTEP such that it only means single-step
mode, and create a new type of SYSCALL_WORK to request a trap immediately
after a syscall in single-step mode.  In the current implementation, the
SYSCALL_WORK flag shadows the TIF_SINGLESTEP flag for simplicity.

Update x86 to flip this bit when a tracer enables single stepping.

Fixes: 299155244770 ("entry: Drop usage of TIF flags in the generic syscall code")
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Kyle Huey <me@kylehuey.com>
Link: https://lore.kernel.org/r/87h7mtc9pr.fsf_-_@collabora.com
---
 arch/x86/kernel/step.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 60d2c3798ba2..0f3c307b37b3 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child)
 		regs->flags |= X86_EFLAGS_TF;
 
 	/*
-	 * Always set TIF_SINGLESTEP - this guarantees that
-	 * we single-step system calls etc..  This will also
+	 * Always set TIF_SINGLESTEP.  This will also
 	 * cause us to set TF when returning to user mode.
 	 */
 	set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
+	/*
+	 * Ensure that a trap is triggered once stepping out of a system
+	 * call prior to executing any user instruction.
+	 */
+	set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
+
 	oflags = regs->flags;
 
 	/* Set TF on the kernel stack.. */
@@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child)
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+	clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
 
 	/* But touch TF only if it was set by us.. */
 	if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
-- 
cgit v1.2.3


From 53c9d9240944088274aadbbbafc6138ca462db4f Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 20 Jan 2021 14:55:44 +0100
Subject: x86/pv: Switch SWAPGS to ALTERNATIVE

SWAPGS is used only for interrupts coming from user mode or for
returning to user mode. So there is no reason to use the PARAVIRT
framework, as it can easily be replaced by an ALTERNATIVE depending
on X86_FEATURE_XENPV.

There are several instances using the PV-aware SWAPGS macro in paths
which are never executed in a Xen PV guest. Replace those with the
plain swapgs instruction. For SWAPGS_UNSAFE_STACK the same applies.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210120135555.32594-5-jgross@suse.com
---
 arch/x86/kernel/asm-offsets_64.c | 1 -
 arch/x86/kernel/paravirt.c       | 1 -
 arch/x86/kernel/paravirt_patch.c | 3 ---
 3 files changed, 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 828be792231e..1354bc30614d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -15,7 +15,6 @@ int main(void)
 #ifdef CONFIG_PARAVIRT_XXL
 	OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
 	       cpu.usergs_sysret64);
-	OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
 #ifdef CONFIG_DEBUG_ENTRY
 	OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee9..5e5fcf5c376d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -312,7 +312,6 @@ struct paravirt_patch_template pv_ops = {
 
 	.cpu.usergs_sysret64	= native_usergs_sysret64,
 	.cpu.iret		= native_iret,
-	.cpu.swapgs		= native_swapgs,
 
 #ifdef CONFIG_X86_IOPL_IOPERM
 	.cpu.invalidate_io_bitmap	= native_tss_invalidate_io_bitmap,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index ace6e334cb39..7c518b08aa3c 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -28,7 +28,6 @@ struct patch_xxl {
 	const unsigned char	irq_restore_fl[2];
 	const unsigned char	cpu_wbinvd[2];
 	const unsigned char	cpu_usergs_sysret64[6];
-	const unsigned char	cpu_swapgs[3];
 	const unsigned char	mov64[3];
 };
 
@@ -43,7 +42,6 @@ static const struct patch_xxl patch_data_xxl = {
 	.cpu_wbinvd		= { 0x0f, 0x09 },	// wbinvd
 	.cpu_usergs_sysret64	= { 0x0f, 0x01, 0xf8,
 				    0x48, 0x0f, 0x07 },	// swapgs; sysretq
-	.cpu_swapgs		= { 0x0f, 0x01, 0xf8 },	// swapgs
 	.mov64			= { 0x48, 0x89, 0xf8 },	// mov %rdi, %rax
 };
 
@@ -86,7 +84,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
 	PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
 
 	PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
-	PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
 	PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
 #endif
 
-- 
cgit v1.2.3


From afd30525a659ac0ae0904f0cb4a2ca75522c3123 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 20 Jan 2021 14:55:45 +0100
Subject: x86/xen: Drop USERGS_SYSRET64 paravirt call

USERGS_SYSRET64 is used to return from a syscall via SYSRET, but
a Xen PV guest will nevertheless use the IRET hypercall, as there
is no sysret PV hypercall defined.

So instead of testing all the prerequisites for doing a sysret and
then mangling the stack for Xen PV again for doing an iret just use
the iret exit from the beginning.

This can easily be done via an ALTERNATIVE like it is done for the
sysenter compat case already.

It should be noted that this drops the optimization in Xen for not
restoring a few registers when returning to user mode, but it seems
as if the saved instructions in the kernel more than compensate for
this drop (a kernel build in a Xen PV guest was slightly faster with
this patch applied).

While at it remove the stale sysret32 remnants.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210120135555.32594-6-jgross@suse.com
---
 arch/x86/kernel/asm-offsets_64.c | 2 --
 arch/x86/kernel/paravirt.c       | 5 +----
 arch/x86/kernel/paravirt_patch.c | 4 ----
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1354bc30614d..b14533af7676 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,8 +13,6 @@ int main(void)
 {
 #ifdef CONFIG_PARAVIRT
 #ifdef CONFIG_PARAVIRT_XXL
-	OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
-	       cpu.usergs_sysret64);
 #ifdef CONFIG_DEBUG_ENTRY
 	OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5e5fcf5c376d..18560b71e717 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
 	else if (opfunc == _paravirt_ident_64)
 		ret = paravirt_patch_ident_64(insn_buff, len);
 
-	else if (type == PARAVIRT_PATCH(cpu.iret) ||
-		 type == PARAVIRT_PATCH(cpu.usergs_sysret64))
+	else if (type == PARAVIRT_PATCH(cpu.iret))
 		/* If operation requires a jmp, then jmp */
 		ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
 #endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
 
 /* These are in entry.S */
 extern void native_iret(void);
-extern void native_usergs_sysret64(void);
 
 static struct resource reserve_ioports = {
 	.start = 0,
@@ -310,7 +308,6 @@ struct paravirt_patch_template pv_ops = {
 
 	.cpu.load_sp0		= native_load_sp0,
 
-	.cpu.usergs_sysret64	= native_usergs_sysret64,
 	.cpu.iret		= native_iret,
 
 #ifdef CONFIG_X86_IOPL_IOPERM
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index 7c518b08aa3c..2fada2c347c9 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -27,7 +27,6 @@ struct patch_xxl {
 	const unsigned char	mmu_write_cr3[3];
 	const unsigned char	irq_restore_fl[2];
 	const unsigned char	cpu_wbinvd[2];
-	const unsigned char	cpu_usergs_sysret64[6];
 	const unsigned char	mov64[3];
 };
 
@@ -40,8 +39,6 @@ static const struct patch_xxl patch_data_xxl = {
 	.mmu_write_cr3		= { 0x0f, 0x22, 0xdf },	// mov %rdi, %cr3
 	.irq_restore_fl		= { 0x57, 0x9d },	// push %rdi; popfq
 	.cpu_wbinvd		= { 0x0f, 0x09 },	// wbinvd
-	.cpu_usergs_sysret64	= { 0x0f, 0x01, 0xf8,
-				    0x48, 0x0f, 0x07 },	// swapgs; sysretq
 	.mov64			= { 0x48, 0x89, 0xf8 },	// mov %rdi, %rax
 };
 
@@ -83,7 +80,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
 	PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
 	PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
 
-	PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
 	PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
 #endif
 
-- 
cgit v1.2.3


From ab234a260b1f625b26cbefa93ca365b0ae66df33 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 20 Jan 2021 14:55:46 +0100
Subject: x86/pv: Rework arch_local_irq_restore() to not use popf

POPF is a rather expensive operation, so don't use it for restoring
irq flags. Instead, test whether interrupts are enabled in the flags
parameter and enable interrupts via STI in that case.

This results in the restore_fl paravirt op to be no longer needed.

Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210120135555.32594-7-jgross@suse.com
---
 arch/x86/kernel/irqflags.S       | 11 -----------
 arch/x86/kernel/paravirt.c       |  1 -
 arch/x86/kernel/paravirt_patch.c |  3 ---
 3 files changed, 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 0db0375235b4..8ef35063964b 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
 	ret
 SYM_FUNC_END(native_save_fl)
 EXPORT_SYMBOL(native_save_fl)
-
-/*
- * void native_restore_fl(unsigned long flags)
- * %eax/%rdi: flags
- */
-SYM_FUNC_START(native_restore_fl)
-	push %_ASM_ARG1
-	popf
-	ret
-SYM_FUNC_END(native_restore_fl)
-EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 18560b71e717..c60222ab8ab9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -320,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
 
 	/* Irq ops. */
 	.irq.save_fl		= __PV_IS_CALLEE_SAVE(native_save_fl),
-	.irq.restore_fl		= __PV_IS_CALLEE_SAVE(native_restore_fl),
 	.irq.irq_disable	= __PV_IS_CALLEE_SAVE(native_irq_disable),
 	.irq.irq_enable		= __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.irq.safe_halt		= native_safe_halt,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index 2fada2c347c9..abd27ec67397 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -25,7 +25,6 @@ struct patch_xxl {
 	const unsigned char	mmu_read_cr2[3];
 	const unsigned char	mmu_read_cr3[3];
 	const unsigned char	mmu_write_cr3[3];
-	const unsigned char	irq_restore_fl[2];
 	const unsigned char	cpu_wbinvd[2];
 	const unsigned char	mov64[3];
 };
@@ -37,7 +36,6 @@ static const struct patch_xxl patch_data_xxl = {
 	.mmu_read_cr2		= { 0x0f, 0x20, 0xd0 },	// mov %cr2, %[re]ax
 	.mmu_read_cr3		= { 0x0f, 0x20, 0xd8 },	// mov %cr3, %[re]ax
 	.mmu_write_cr3		= { 0x0f, 0x22, 0xdf },	// mov %rdi, %cr3
-	.irq_restore_fl		= { 0x57, 0x9d },	// push %rdi; popfq
 	.cpu_wbinvd		= { 0x0f, 0x09 },	// wbinvd
 	.mov64			= { 0x48, 0x89, 0xf8 },	// mov %rdi, %rax
 };
@@ -71,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
 	switch (type) {
 
 #ifdef CONFIG_PARAVIRT_XXL
-	PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
 	PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
 	PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
 	PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
-- 
cgit v1.2.3