aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2013-10-30 20:06:13 +0530
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-12-05 16:06:06 +1100
commitb63a0ffe35de7e5f9b907bbc2c783e702f7e15af (patch)
tree4ec5c5261fcfcb9789f7ff5a36852fada5dc8564 /arch/powerpc/platforms/powernv
parent28446de2ce9992f6d13e4594a25fc9c3b9f4517b (diff)
powerpc/powernv: Machine check exception handling.
Add basic error handling in machine check exception handler. - If MSR_RI isn't set, we can not recover. - Check if disposition set to OpalMCE_DISPOSITION_RECOVERED. - Check if address at fault is inside kernel address space, if not then send SIGBUS to process if we hit exception when in userspace. - If address at fault is not provided then and if we get a synchronous machine check while in userspace then kill the task. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/opal.c43
1 files changed, 42 insertions, 1 deletions
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f348bd49487c..01e74cbc67e9 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -18,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/slab.h>
+#include <linux/sched.h>
#include <linux/kobject.h>
#include <asm/opal.h>
#include <asm/firmware.h>
@@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
return written;
}
+static int opal_recover_mce(struct pt_regs *regs,
+ struct machine_check_event *evt)
+{
+ int recovered = 0;
+ uint64_t ea = get_mce_fault_addr(evt);
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */
+ recovered = 0;
+ } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+ /* Platform corrected itself */
+ recovered = 1;
+ } else if (ea && !is_kernel_addr(ea)) {
+ /*
+ * Faulting address is not in kernel text. We should be fine.
+ * We need to find which process uses this address.
+ * For now, kill the task if we have received exception when
+ * in userspace.
+ *
+ * TODO: Queue up this address for hwpoisioning later.
+ */
+ if (user_mode(regs) && !is_global_init(current)) {
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ } else
+ recovered = 0;
+ } else if (user_mode(regs) && !is_global_init(current) &&
+ evt->severity == MCE_SEV_ERROR_SYNC) {
+ /*
+ * If we have received a synchronous error when in userspace
+ * kill the task.
+ */
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ }
+ return recovered;
+}
+
int opal_machine_check(struct pt_regs *regs)
{
struct machine_check_event evt;
@@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs)
}
machine_check_print_event_info(&evt);
- return evt.severity == MCE_SEV_FATAL ? 0 : 1;
+ if (opal_recover_mce(regs, &evt))
+ return 1;
+ return 0;
}
static irqreturn_t opal_interrupt(int irq, void *data)