x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned

An interesting thing happened when a guest Linux instance took a machine check. The VMM unmapped the bad page from guest physical space and passed the machine check to the guest. Linux took all the normal actions to offline the page from the process that was using it. But then guest Linux crashed because it said there was a second machine check inside the kernel with this stack trace: do_memory_failure set_mce_nospec set_memory_uc _set_memory_uc change_page_attr_set_clr cpa_flush clflush_cache_range_opt This was odd, because a CLFLUSH instruction shouldn't raise a machine check (it isn't consuming the data). Further investigation showed that the VMM had passed in another machine check because is appeared that the guest was accessing the bad page. Fix is to check the scope of the poison by checking the MCi_MISC register. If the entire page is affected, then unmap the page. If only part of the page is affected, then mark the page as uncacheable. This assumes that VMMs will do the logical thing and pass in the "whole page scope" via the MCi_MISC register (since they unmapped the entire page). [ bp: Adjust to x86/entry changes. ] Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") Reported-by: Jue Wang <juew@google.com> Signed-off-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Borislav Petkov <bp@suse.de> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Jue Wang <juew@google.com> Cc: <stable@vger.kernel.org> Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel.com
author: Tony Luck <tony.luck@intel.com> 2020-05-20 09:35:46 -0700
committer: Thomas Gleixner <tglx@linutronix.de> 2020-06-11 15:19:17 +0200
commit: 17fae1294ad9d711b2c3dd0edef479d40c76a5e8 (patch)
tree: 13d6d176ac7bf835ab28f7b5bfe7e08996c9decb /arch/x86/kernel
parent: f77d26a9fc525286bcef3d4f98b52e17482cf49c (diff)
1 files changed, 14 insertions, 4 deletions
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 30413325de22..ce9120c4f740 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
+static bool whole_page(struct mce *m)
+{
+	if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+		return true;
+
+	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
 bool mce_is_correctable(struct mce *m)
 {
 	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 
 	pfn = mce->addr >> PAGE_SHIFT;
 	if (!memory_failure(pfn, 0)) {
-		set_mce_nospec(pfn);
+		set_mce_nospec(pfn, whole_page(mce));
 		mce->kflags |= MCE_HANDLED_UC;
 	}
 
@@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb)
 	int flags = MF_ACTION_REQUIRED;
 
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
-	if (!(p->mce_status & MCG_STATUS_RIPV))
+
+	if (!p->mce_ripv)
 		flags |= MF_MUST_KILL;
 
 	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		return;
 	}
 
@@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs)
 		BUG_ON(!on_thread_stack() || !user_mode(regs));
 
 		current->mce_addr = m.addr;
-		current->mce_status = m.mcgstatus;
+		current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
+		current->mce_whole_page = whole_page(&m);
 		current->mce_kill_me.func = kill_me_maybe;
 		if (kill_it)
 			current->mce_kill_me.func = kill_me_now;
author	Tony Luck <tony.luck@intel.com>	2020-05-20 09:35:46 -0700
committer	Thomas Gleixner <tglx@linutronix.de>	2020-06-11 15:19:17 +0200
commit	17fae1294ad9d711b2c3dd0edef479d40c76a5e8 (patch)
tree	13d6d176ac7bf835ab28f7b5bfe7e08996c9decb /arch/x86/kernel
parent	f77d26a9fc525286bcef3d4f98b52e17482cf49c (diff)