From 4d2be1267fcfb3a4d2198fd696aec5e3dcbce60e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 11 Jun 2009 15:28:09 +0530 Subject: perf_counter, x86: Check old-AMD performance monitoring support AMD supports performance monitoring start from K7 (i.e. family 6), so disable it for earlier AMD CPUs. Signed-off-by: Jaswinder Singh Rajput Cc: Robert Richter Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1244714289.6923.0.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 275bc142cd5..3c37c3930ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1459,6 +1459,10 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + x86_pmu = amd_pmu; switch (boot_cpu_data.x86) { -- cgit v1.2.3 From f4db43a38f7387c3b19c9565124c06ab0c5d6e9a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 13 Jun 2009 01:06:21 +0530 Subject: perf_counter, x86: Update AMD hw caching related event table All AMD models share the same hw caching related event table. Also complete the table with more events. Signed-off-by: Jaswinder Singh Rajput Cc: Robert Richter Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1244835381.2802.2.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3c37c3930ca..77a59a5566a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -389,23 +389,23 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } -static const u64 amd_0f_hw_cache_event_ids +static const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ }, }, [ C(L1I ) ] = { @@ -418,17 +418,17 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ [ C(RESULT_MISS) ] = 0, }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -438,8 +438,8 @@ static const u64 amd_0f_hw_cache_event_ids }, [ 
C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -1465,16 +1465,10 @@ static int amd_pmu_init(void) x86_pmu = amd_pmu; - switch (boot_cpu_data.x86) { - case 0x0f: - case 0x10: - case 0x11: - memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); - pr_cont("AMD Family 0f/10/11 events, "); - break; - } return 0; } -- cgit v1.2.3 From 5a6cec3abbdb74244caab68db100825a8c4ac02d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter, x86: Fix call-chain walking Fix the ptregs variant when we hit user-mode tasks. Cc: Frederic Weisbecker Cc: Pekka Enberg Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 77a59a5566a..09d8cb69c3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1644,7 +1644,9 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) const void __user *fp; int nr = entry->nr; - regs = (struct pt_regs *)current->thread.sp0 - 1; + if (!user_mode(regs)) + regs = task_pt_regs(current); + fp = (void __user *)regs->bp; callchain_store(entry, regs->ip); @@ -1656,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!copy_stack_frame(fp, &frame)) break; - if ((unsigned long)fp < user_stack_pointer(regs)) + if ((unsigned long)fp < regs->sp) break; callchain_store(entry, frame.return_address); -- cgit v1.2.3 From 038e836e97e70c4ad2b5058b07fc7207f50b59dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Jun 2009 09:57:59 +0200 Subject: perf_counter, x86: Fix kernel-space call-chains Kernel-space call-chains were trimmed at the first entry because we never processed anything beyond the first stack context. Allow the backtrace to jump from NMI to IRQ stack then to task stack and finally user-space stack. Also calculate the stack and bp variables correctly so that the stack walker does not exit early. We can get deep traces as a result, visible in perf report -D output: 0x32af0 [0xe0]: PERF_EVENT (IP, 5): 15134: 0xffffffff815225fd period: 1 ... chain: u:2, k:22, nr:24 ..... 0: 0xffffffff815225fd ..... 1: 0xffffffff810ac51c ..... 2: 0xffffffff81018e29 ..... 3: 0xffffffff81523939 ..... 4: 0xffffffff81524b8f ..... 5: 0xffffffff81524bd9 ..... 6: 0xffffffff8105e498 ..... 7: 0xffffffff8152315a ..... 8: 0xffffffff81522c3a ..... 9: 0xffffffff810d9b74 ..... 10: 0xffffffff810dbeec ..... 11: 0xffffffff810dc3fb This is a 22-entries kernel-space chain. (We still only record reliable stack entries.) 
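For readers unfamiliar with the walk itself: the call-chain code in these patches follows the classic frame-pointer chain, in which each frame begins with the caller's saved frame pointer followed by the return address -- the struct stack_frame layout that perf_callchain_user() reads. The snippet below is only a user-space sketch of that technique, not kernel code; it assumes x86-64, GCC's __builtin_frame_address() builtin and a build with -O0 -fno-omit-frame-pointer.

#include <stdio.h>

/* Same layout idea as the kernel's struct stack_frame: the caller's
 * saved frame pointer, then the return address. */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void __attribute__((noinline)) walk_stack(void)
{
	/* Current frame pointer; the kernel walk starts from regs->bp. */
	struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth < 16) {
		printf("depth %2d: return address %#lx\n",
		       depth, fp->return_address);
		/* Frames must move toward higher addresses; a bogus
		 * pointer ends the walk (the kernel compares against
		 * regs->sp for the same reason). */
		if (fp->next_frame <= fp)
			break;
		fp = fp->next_frame;
		depth++;
	}
}

static void __attribute__((noinline)) leaf(void)   { walk_stack(); }
static void __attribute__((noinline)) middle(void) { leaf(); }

int main(void)
{
	middle();	/* gcc -O0 -fno-omit-frame-pointer walk.c */
	return 0;
}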
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 09d8cb69c3f..6d5e7cfd97e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1575,8 +1575,8 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Don't bother with IRQ stacks for now */ - return -1; + /* Process all stacks: */ + return 0; } static void backtrace_address(void *data, unsigned long addr, int reliable) @@ -1594,6 +1594,8 @@ static const struct stacktrace_ops backtrace_ops = { .address = backtrace_address, }; +#include "../dumpstack.h" + static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { @@ -1601,26 +1603,20 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) char *stack; int nr = entry->nr; - callchain_store(entry, instruction_pointer(regs)); + callchain_store(entry, regs->ip); stack = ((char *)regs + sizeof(struct pt_regs)); #ifdef CONFIG_FRAME_POINTER - bp = frame_pointer(regs); + get_bp(bp); #else bp = 0; #endif - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry); entry->kernel = entry->nr - nr; } - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) { int ret; @@ -1652,7 +1648,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, regs->ip); while (entry->nr < MAX_STACK_DEPTH) { - frame.next_fp = NULL; + frame.next_frame = NULL; frame.return_address = 0; if (!copy_stack_frame(fp, &frame)) @@ -1662,7 +1658,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) break; callchain_store(entry, frame.return_address); - fp = frame.next_fp; + fp = frame.next_frame; } entry->user = entry->nr - nr; -- cgit v1.2.3 From 74193ef0ecab92535c8517f082f1f50504526c9b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 13:07:24 +0200 Subject: perf_counter: x86: Fix call-chain support to use NMI-safe methods __copy_from_user_inatomic() isn't NMI safe in that it can trigger the page fault handler which is another trap and its return path invokes IRET which will also close the NMI context. Therefore use a GUP based approach to copy the stack frames over. We tried an alternative solution as well: we used a forward ported version of Mathieu Desnoyers's "NMI safe INT3 and Page Fault" patch that modifies the exception return path to use an open-coded IRET with explicit stack unrolling and TF checking. This didnt work as it interacted with faulting user-space instructions, causing them not to restart properly, which corrupts user-space registers. Solving that would probably involve disassembling those instructions and backtracing the RIP. But even without that, the code was deemed rather complex to the already non-trivial x86 entry assembly code, so instead we went for this GUP based method that does a software-walk of the pagetables. 
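The heart of the GUP-based copy added by this patch is splitting an arbitrary user range into chunks that never cross a page boundary, because __get_user_pages_fast() pins and kmap_atomic() maps one page at a time. Below is a self-contained user-space sketch of just that offset/size arithmetic, assuming 4 KiB pages; the kernel function wraps each chunk in the pin/map/memcpy/unmap/unpin sequence.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumption: 4 KiB pages, as on x86 */

/*
 * Split [addr, addr + n) into page-bounded chunks -- the same
 * offset/size computation copy_from_user_nmi() performs before
 * pinning and kmapping each page.
 */
static void split_into_page_chunks(unsigned long addr, unsigned long n)
{
	unsigned long len = 0;

	while (len < n) {
		unsigned long offset = (addr + len) & (PAGE_SIZE - 1);
		unsigned long size = PAGE_SIZE - offset;

		if (size > n - len)
			size = n - len;

		printf("copy %4lu bytes at %#lx (page %#lx, offset %4lu)\n",
		       size, addr + len,
		       (addr + len) & ~(PAGE_SIZE - 1), offset);
		len += size;
	}
}

int main(void)
{
	/* Example: a 10000-byte range starting 100 bytes short of a
	 * page boundary -> chunks of 100, 4096, 4096 and 1708 bytes. */
	split_into_page_chunks(0x7f0000000f9cUL, 10000);
	return 0;
}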
Signed-off-by: Peter Zijlstra Cc: Nick Piggin Cc: Pekka Enberg Cc: Vegard Nossum Cc: Jeremy Fitzhardinge Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 49 ++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d5e7cfd97e..e8c68a5091d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1617,20 +1618,48 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) entry->kernel = entry->nr - nr; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; int ret; - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); - return ret; + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); } static void @@ -1643,7 +1672,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!user_mode(regs)) regs = task_pt_regs(current); - fp = (void __user *)regs->bp; + fp = (void __user *)regs->bp; callchain_store(entry, regs->ip); -- cgit v1.2.3 From 86e13684aa77f07c77db352f437d9e53a84dde90 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Wed, 22 Apr 2009 13:48:30 +0200 Subject: [CPUFREQ] powernow-k8: Set transition latency to 1 if ACPI tables export 0 This doesn't fix anything, but it's expected that a transition latency of 0 could cause trouble in the future. Signed-off-by: Thomas Renninger Cc: Langsdorf, Mark Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index cf52215d9eb..935989693a6 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1046,6 +1046,19 @@ static int get_transition_latency(struct powernow_k8_data *data) if (cur_latency > max_latency) max_latency = cur_latency; } + if (max_latency == 0) { + /* + * Fam 11h always returns 0 as transition latency. + * This is intended and means "very fast". While cpufreq core + * and governors currently can handle that gracefully, better + * set it to 1 to avoid problems in the future. + * For all others it's a BIOS bug. 
+ */ + if (!boot_cpu_data.x86 == 0x11) + printk(KERN_ERR FW_WARN PFX "Invalid zero transition " + "latency\n"); + max_latency = 1; + } /* value in usecs, needs to be in nanoseconds */ return 1000 * max_latency; } -- cgit v1.2.3 From 21335d021464c3ba3c20fc7207ffe2bdd2458568 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 23 Apr 2009 19:45:04 +0100 Subject: [CPUFREQ] powernow-k8.c: mess cleanup Mess cleanup in powernow_k8_acpi_pst_values() function. Signed-off-by: Luis Henriques Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 935989693a6..a9d7ecbf64b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,3 +1,4 @@ + /* * (c) 2003-2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the @@ -823,13 +824,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; - control = data->acpi_data.states[index].control; data->irt = (control - >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> - RVO_SHIFT) & RVO_MASK; data->exttype = (control - >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 - << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = - (control >> VST_SHIFT) & VST_MASK; } + control = data->acpi_data.states[index].control; + data->irt = (control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (control >> VST_SHIFT) & VST_MASK; +} static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { -- cgit v1.2.3 From b394f1dfc070e6f50f5f33694000815e50ef319d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 10 Jun 2009 12:41:37 -0700 Subject: [CPUFREQ] reduce scope of ACPI_PSS_BIOS_BUG_MSG[] This symbol doesn't need file-global scope. 
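Reducing the scope here simply means moving the definition into the one function that uses it: the array keeps static storage duration (a single copy, initialized once), but the identifier is no longer visible elsewhere in the file. A tiny stand-alone C illustration of the difference, with hypothetical names rather than the kernel code:

#include <stdio.h>

/* File scope: every function in this file can reference it. */
static const char file_scope_msg[] = "visible to the whole file\n";

static void report(void)
{
	/*
	 * Function scope: still static storage duration (one copy for
	 * the program's lifetime), but only report() can name it --
	 * which is all the powernow-k8 change does with
	 * ACPI_PSS_BIOS_BUG_MSG.
	 */
	static const char func_scope_msg[] = "visible only inside report()\n";

	fputs(func_scope_msg, stdout);
}

int main(void)
{
	fputs(file_scope_msg, stdout);
	report();
	return 0;
}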
Cc: "Zhang, Rui" Cc: Dave Jones Cc: Ingo Molnar Cc: Langsdorf, Mark Cc: Leo Milano Cc: Thomas Renninger Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9d7ecbf64b..2709b3c183b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1250,13 +1250,12 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } -static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; - /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { + static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; cpumask_t oldmask; int rc; -- cgit v1.2.3 From 532cfee6ba0a8efcf7c3ce38b9881292d79d516e Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Thu, 11 Jun 2009 15:26:48 +0000 Subject: [CPUFREQ] powernow-k8: read P-state from HW By definition, "cpuinfo_cur_freq" should report the value from HW. So, don't depend on the cached value. Instead read P-state directly from HW, while taking into account the erratum 311 workaround for Fam 11h processors. Cc: Andreas Herrmann Cc: Langsdorf, Mark Cc: Thomas Renninger Signed-off-by: Naga Chumbalkar Reviewed-by: Andreas Herrmann Tested-by: Andreas Herrmann Acked-by: Langsdorf, Mark Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2709b3c183b..331021112f2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -118,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) u32 i = 0; if (cpu_family == CPU_HW_PSTATE) { - if (data->currpstate == HW_PSTATE_INVALID) { - /* read (initial) hw pstate if not yet set */ - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if (i >= data->numps) - data->currpstate = HW_PSTATE_0; - else - data->currpstate = i; - } + rdmsr(MSR_PSTATE_STATUS, lo, hi); + i = lo & HW_PSTATE_MASK; + data->currpstate = i; + + /* + * a workaround for family 11h erratum 311 might cause + * an "out-of-range Pstate if the core is in Pstate-0 + */ + if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) + data->currpstate = HW_PSTATE_0; + return 0; } do { -- cgit v1.2.3 From e15bc4559b397a611441a135b1f5992f07d0f436 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Thu, 11 Jun 2009 15:26:54 +0000 Subject: [CPUFREQ] powernow-k8: get drv data for correct CPU Make powernowk8_get() similar to powernowk8_target() and powernowk8_verify() in the way it obtains "powernow_data" for a given CPU. 
Cc: Andreas Herrmann Cc: Langsdorf, Mark Cc: Thomas Renninger Signed-off-by: Naga Chumbalkar Reviewed-by: Andreas Herrmann Tested-by: Andreas Herrmann Acked-by: Langsdorf, Mark Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 331021112f2..20c7b99d7ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1385,13 +1385,9 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) static unsigned int powernowk8_get(unsigned int cpu) { - struct powernow_k8_data *data; + struct powernow_k8_data *data = per_cpu(powernow_data, cpu); cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; - unsigned int first; - - first = cpumask_first(cpu_core_mask(cpu)); - data = per_cpu(powernow_data, first); if (!data) return -EINVAL; -- cgit v1.2.3 From 394122ab144dae4b276d74644a2f11c44a60ac5c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Jun 2009 22:59:58 +0930 Subject: [CPUFREQ] cpumask: avoid playing with cpus_allowed in speedstep-ich.c Impact: don't play with current's cpumask It's generally a very bad idea to mug some process's cpumask: it could legitimately and reasonably be changed by root, which could break us (if done before our code) or them (if we restore the wrong value). We use smp_call_function_single: this had the advantage of being more efficient, too. Signed-off-by: Rusty Russell To: cpufreq@vger.kernel.org Cc: Dominik Brodowski Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 93 +++++++++++++++++------------ arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1 + 2 files changed, 55 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4fa3f..6911e91fb4f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -89,7 +89,8 @@ static int speedstep_find_register(void) * speedstep_set_state - set the SpeedStep state * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) * - * Tries to change the SpeedStep state. + * Tries to change the SpeedStep state. Can be called from + * smp_call_function_single. */ static void speedstep_set_state(unsigned int state) { @@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state) return; } +/* Wrapper for smp_call_function_single. 
*/ +static void _speedstep_set_state(void *_state) +{ + speedstep_set_state(*(unsigned int *)_state); +} /** * speedstep_activate - activate SpeedStep control in the chipset @@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -static unsigned int _speedstep_get(const struct cpumask *cpus) -{ +struct get_freq_data { unsigned int speed; - cpumask_t cpus_allowed; - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpus); - speed = speedstep_get_frequency(speedstep_processor); - set_cpus_allowed_ptr(current, &cpus_allowed); - dprintk("detected %u kHz as current frequency\n", speed); - return speed; + unsigned int processor; +}; + +static void get_freq_data(void *_data) +{ + struct get_freq_data *data = _data; + + data->speed = speedstep_get_frequency(data->processor); } static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(cpumask_of(cpu)); + struct get_freq_data data = { .processor = cpu }; + + /* You're supposed to ensure CPU is online. */ + if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + BUG(); + + dprintk("detected %u kHz as current frequency\n", data.speed); + return data.speed; } /** @@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - unsigned int newstate = 0; + unsigned int newstate = 0, policy_cpu; struct cpufreq_freqs freqs; - cpumask_t cpus_allowed; int i; if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) return -EINVAL; - freqs.old = _speedstep_get(policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); + freqs.old = speedstep_get(policy_cpu); freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; @@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - cpus_allowed = current->cpus_allowed; - for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } - /* switch to physical CPU where state is to be changed */ - set_cpus_allowed_ptr(current, policy->cpus); - - speedstep_set_state(newstate); - - /* allow to be run on all CPUs */ - set_cpus_allowed_ptr(current, &cpus_allowed); + smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, + true); for_each_cpu(i, policy->cpus) { freqs.cpu = i; @@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy) return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); } +struct get_freqs { + struct cpufreq_policy *policy; + int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ + struct get_freqs *get_freqs = _get_freqs; + + get_freqs->ret = + speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &get_freqs->policy->cpuinfo.transition_latency, + &speedstep_set_state); +} static int speedstep_cpu_init(struct cpufreq_policy *policy) { - int result = 0; - unsigned int speed; - cpumask_t cpus_allowed; + int result; + unsigned int policy_cpu, speed; + struct get_freqs gf; /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); #endif - - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, policy->cpus); + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); /* detect low and high frequency and transition latency */ - result = 
speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &policy->cpuinfo.transition_latency, - &speedstep_set_state); - set_cpus_allowed_ptr(current, &cpus_allowed); - if (result) - return result; + gf.policy = policy; + smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); + if (gf.ret) + return gf.ret; /* get current speed setting */ - speed = _speedstep_get(policy->cpus); + speed = speedstep_get(policy_cpu); if (!speed) return -EIO; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c6862657..f4c290b8482 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void) } +/* Warning: may get called from smp_call_function_single. */ unsigned int speedstep_get_frequency(unsigned int processor) { switch (processor) { -- cgit v1.2.3 From e3f996c26ff6c4c084aaaa64dce6e54d31f517be Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Jun 2009 22:59:58 +0930 Subject: [CPUFREQ] cpumask: avoid cpumask games in arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c Impact: don't play with current's cpumask It's generally a very bad idea to mug some process's cpumask: it could legitimately and reasonably be changed by root, which could break us (if done before our code) or them (if we restore the wrong value). Use rdmsr_on_cpu and wrmsr_on_cpu instead. Signed-off-by: Rusty Russell To: cpufreq@vger.kernel.org Cc: Jeremy Fitzhardinge Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 60 +++++++----------------- 1 file changed, 17 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 55c831ed71c..8d672ef162c 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu) { unsigned l, h; unsigned clock_freq; - cpumask_t saved_mask; - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) - return 0; - - rdmsr(MSR_IA32_PERF_STATUS, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); clock_freq = extract_clock(l, cpu, 0); if (unlikely(clock_freq == 0)) { @@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu) * P-state transition (like TM2). Get the last freq set * in PERF_CTL. 
*/ - rdmsr(MSR_IA32_PERF_CTL, l, h); + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); clock_freq = extract_clock(l, cpu, 1); } - - set_cpus_allowed_ptr(current, &saved_mask); return clock_freq; } @@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy, struct cpufreq_freqs freqs; int retval = 0; unsigned int j, k, first_cpu, tmp; - cpumask_var_t saved_mask, covered_cpus; + cpumask_var_t covered_cpus; - if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) - return -ENOMEM; - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { - free_cpumask_var(saved_mask); + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) return -ENOMEM; - } - cpumask_copy(saved_mask, ¤t->cpus_allowed); if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { retval = -ENODEV; @@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy, first_cpu = 1; for_each_cpu(j, policy->cpus) { - const struct cpumask *mask; + int good_cpu; /* cpufreq holds the hotplug lock, so we are safe here */ if (!cpu_online(j)) @@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy, * Make sure we are running on CPU that wants to change freq */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - mask = policy->cpus; + good_cpu = cpumask_any_and(policy->cpus, + cpu_online_mask); else - mask = cpumask_of(j); + good_cpu = j; - set_cpus_allowed_ptr(current, mask); - preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { + if (good_cpu >= nr_cpu_ids) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { /* We haven't started the transition yet. */ - goto migrate_end; + goto out; } - preempt_enable(); break; } msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { - rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); + rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { dprintk("no change needed - msr was and needs " "to be %x\n", oldmsr); retval = 0; - goto migrate_end; + goto out; } freqs.old = extract_clock(oldmsr, cpu, 0); @@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy, oldmsr |= msr; } - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - preempt_enable(); + wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) break; - } - cpu_set(j, *covered_cpus); - preempt_enable(); + cpumask_set_cpu(j, covered_cpus); } for_each_cpu(k, policy->cpus) { @@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. 
*/ - for_each_cpu_mask_nr(j, *covered_cpus) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - } + for_each_cpu(j, covered_cpus) + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); tmp = freqs.new; freqs.new = freqs.old; @@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy, cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, saved_mask); retval = 0; - goto out; -migrate_end: - preempt_enable(); - set_cpus_allowed_ptr(current, saved_mask); out: - free_cpumask_var(saved_mask); free_cpumask_var(covered_cpus); return retval; } -- cgit v1.2.3 From 1ff6e97f1d993dff2f9b6f4a9173687370660232 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 20:55:37 +0930 Subject: [CPUFREQ] cpumask: avoid playing with cpus_allowed in powernow-k8.c cpumask: avoid playing with cpus_allowed in powernow-k8.c It's generally a very bad idea to mug some process's cpumask: it could legitimately and reasonably be changed by root, which could break us (if done before our code) or them (if we restore the wrong value). I did not replace powernowk8_target; it needs fixing, but it grabs a mutex (so no smp_call_function_single here) but Mark points out it can be called multiple times per second, so work_on_cpu is too heavy. Signed-off-by: Rusty Russell To: cpufreq@vger.kernel.org Acked-by: Mark Langsdorf Tested-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 118 +++++++++++++++--------------- 1 file changed, 60 insertions(+), 58 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 20c7b99d7ba..1f55547d5b3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -508,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, return 0; } -static int check_supported_cpu(unsigned int cpu) +static void check_supported_cpu(void *_rc) { - cpumask_t oldmask; u32 eax, ebx, ecx, edx; - unsigned int rc = 0; - - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + int *rc = _rc; - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); - goto out; - } + *rc = -ENODEV; if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto out; + return; eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - goto out; + return; if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); - goto out; + return; } eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { printk(KERN_INFO PFX "No frequency change capabilities detected\n"); - goto out; + return; } cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); @@ -550,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu) != P_STATE_TRANSITION_CAPABLE) { printk(KERN_INFO PFX "Power state transitions not supported\n"); - goto out; + return; } } else { /* must be a HW Pstate capable processor */ cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) cpu_family = CPU_HW_PSTATE; else - goto out; + return; } - rc = 1; - -out: - set_cpus_allowed_ptr(current, &oldmask); - return rc; + 
*rc = 0; } static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, @@ -1247,6 +1236,32 @@ static int powernowk8_verify(struct cpufreq_policy *pol) return cpufreq_frequency_table_verify(pol, data->powernow_table); } +struct init_on_cpu { + struct powernow_k8_data *data; + int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ + struct init_on_cpu *init_on_cpu = _init_on_cpu; + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + init_on_cpu->rc = -ENODEV; + return; + } + + if (query_current_values_with_pending_wait(init_on_cpu->data)) { + init_on_cpu->rc = -ENODEV; + return; + } + + if (cpu_family == CPU_OPTERON) + fidvid_msr_init(); + + init_on_cpu->rc = 0; +} + /* per CPU init entry point to the driver */ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { @@ -1254,13 +1269,14 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; - cpumask_t oldmask; + struct init_on_cpu init_on_cpu; int rc; if (!cpu_online(pol->cpu)) return -ENODEV; - if (!check_supported_cpu(pol->cpu)) + smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); + if (rc) return -ENODEV; data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); @@ -1300,27 +1316,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) pol->cpuinfo.transition_latency = get_transition_latency(data); /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out_unmask; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - goto err_out_unmask; - } - - if (query_current_values_with_pending_wait(data)) - goto err_out_unmask; - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - /* run on any CPU again */ - set_cpus_allowed_ptr(current, &oldmask); + init_on_cpu.data = data; + smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, + &init_on_cpu, 1); + rc = init_on_cpu.rc; + if (rc != 0) + goto err_out_exit_acpi; if (cpu_family == CPU_HW_PSTATE) cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); @@ -1357,8 +1358,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) return 0; -err_out_unmask: - set_cpus_allowed_ptr(current, &oldmask); +err_out_exit_acpi: powernow_k8_cpu_exit_acpi(data); err_out: @@ -1383,24 +1383,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) return 0; } +static void query_values_on_cpu(void *_err) +{ + int *err = _err; + struct powernow_k8_data *data = __get_cpu_var(powernow_data); + + *err = query_current_values_with_pending_wait(data); +} + static unsigned int powernowk8_get(unsigned int cpu) { struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; + int err; if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (smp_processor_id() != cpu) { - printk(KERN_ERR PFX - "limiting to CPU %d failed in powernowk8_get\n", cpu); - set_cpus_allowed_ptr(current, &oldmask); - return 0; - } - - if (query_current_values_with_pending_wait(data)) + smp_call_function_single(cpu, query_values_on_cpu, &err, true); + if (err) goto out; if 
(cpu_family == CPU_HW_PSTATE) @@ -1411,7 +1412,6 @@ static unsigned int powernowk8_get(unsigned int cpu) out: - set_cpus_allowed_ptr(current, &oldmask); return khz; } @@ -1437,7 +1437,9 @@ static int __cpuinit powernowk8_init(void) unsigned int i, supported_cpus = 0; for_each_online_cpu(i) { - if (check_supported_cpu(i)) + int rc; + smp_call_function_single(i, check_supported_cpu, &rc, 1); + if (rc == 0) supported_cpus++; } -- cgit v1.2.3 From 8e7c25971b1590776a90b249de3d859dd45e7414 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 20:58:37 +0930 Subject: [CPUFREQ] cpumask: new cpumask operators for arch/x86/kernel/cpu/cpufreq/powernow-k8.c Remove all old-style cpumask operators, and cpumask_t. Also: get rid of the unused define_siblings function. Signed-off-by: Rusty Russell Acked-by: Mark Langsdorf Tested-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 8 ++++---- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 11 ----------- 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 1f55547d5b3..81cbe64ed6b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1094,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1102,7 +1102,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -1127,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1135,7 +1135,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask_nr(i, *(data->available_cores)) { + for_each_cpu(i, data->available_cores) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698feade..c9c1190b5e1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -223,14 +223,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); - -#ifdef CONFIG_SMP -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ -} -#else -static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) -{ - cpu_set(0, cpu_sharedcore_mask[0]); -} -#endif 
-- cgit v1.2.3 From 184e1fdfea066ab8f12a1e8912f402d2d6556d11 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 15 Jun 2009 15:37:07 +0800 Subject: x86, mce: fix a race condition about mce_callin and no_way_out If one CPU has no_way_out == 1, all other CPUs should have no_way_out == 1. But despite global_nwo is read after mce_callin, global_nwo is updated after mce_callin too. So it is possible that some CPU read global_nwo before some other CPU update global_nwo, so that no_way_out == 1 for some CPU, while no_way_out == 0 for some other CPU. This patch fixes this race condition via moving mce_callin updating after global_nwo updating, with a smp_wmb in between. A smp_rmb is added between their reading too. Signed-off-by: Huang Ying Acked-by: Andi Kleen Acked-by: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e455..19294b8524c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -703,6 +703,11 @@ static int mce_start(int no_way_out, int *order) } atomic_add(no_way_out, &global_nwo); + /* + * global_nwo should be updated before mce_callin + */ + smp_wmb(); + *order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -716,6 +721,10 @@ static int mce_start(int no_way_out, int *order) ndelay(SPINUNIT); } + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); /* * Cache the global no_way_out state. */ @@ -862,7 +871,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this @@ -887,7 +896,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; - order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); -- cgit v1.2.3 From 33edbf02a92771fa2a81e41084a44ba874e3a5a5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:18:45 +0900 Subject: x86, mce: don't init timer if !mce_available In mce_cpu_restart, mce_init_timer is called unconditionally. If !mce_available (e.g. mce is disabled), there are no useful work for timer. Stop running it. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 19294b8524c..dda77215e9e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,8 +1617,9 @@ static int mce_resume(struct sys_device *dev) static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); - if (mce_available(¤t_cpu_data)) - mce_init(); + if (!mce_available(¤t_cpu_data)) + return; + mce_init(); mce_init_timer(); } -- cgit v1.2.3 From 7fb06fc9672b947424e05871243a4c8e19ec3bce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 18:18:43 +0900 Subject: x86, mce: cleanup mce_start() Simplify interface of mce_start(): - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); Now Monarch and Subjects share same exit(return) in usual path. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 66 +++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index dda77215e9e..739fd7eca0a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -691,23 +691,21 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int no_way_out, int *order) +static int mce_start(int *no_way_out) { - int nwo; + int order; int cpus = num_online_cpus(); u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; - if (!timeout) { - *order = -1; - return no_way_out; - } + if (!timeout) + return -1; - atomic_add(no_way_out, &global_nwo); + atomic_add(*no_way_out, &global_nwo); /* * global_nwo should be updated before mce_callin */ smp_wmb(); - *order = atomic_add_return(1, &mce_callin); + order = atomic_add_return(1, &mce_callin); /* * Wait for everyone. @@ -715,8 +713,7 @@ static int mce_start(int no_way_out, int *order) while (atomic_read(&mce_callin) != cpus) { if (mce_timed_out(&timeout)) { atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; + return -1; } ndelay(SPINUNIT); } @@ -725,34 +722,34 @@ static int mce_start(int no_way_out, int *order) * mce_callin should be read before global_nwo */ smp_rmb(); - /* - * Cache the global no_way_out state. - */ - nwo = atomic_read(&global_nwo); - /* - * Monarch starts executing now, the others wait. - */ - if (*order == 1) { + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ atomic_set(&mce_executing, 1); - return nwo; + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } } /* - * Now start the scanning loop one by one - * in the original callin order. - * This way when there are any shared banks it will - * be only seen by one CPU before cleared, avoiding duplicates. + * Cache the global no_way_out state. */ - while (atomic_read(&mce_executing) < *order) { - if (mce_timed_out(&timeout)) { - atomic_set(&global_nwo, 0); - *order = -1; - return no_way_out; - } - ndelay(SPINUNIT); - } - return nwo; + *no_way_out = atomic_read(&global_nwo); + + return order; } /* @@ -871,8 +868,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; - + int order; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -917,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * This way we don't report duplicated events on shared banks * because the first one to see it will clear it. */ - no_way_out = mce_start(no_way_out, &order); + order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) -- cgit v1.2.3 From 4e5b3e690dda890523e93af9c545261f5916a3a6 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:20 +0900 Subject: x86, mce: add __read_mostly Add __read_mostly to data written during setup. Suggested-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 739fd7eca0a..2d2e0b565a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,7 +57,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -int mce_disabled; +int mce_disabled __read_mostly; #ifdef CONFIG_X86_NEW_MCE @@ -76,19 +76,19 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static int monarch_timeout = -1; -static int mce_panic_timeout; -static int mce_dont_log_ce; -int mce_cmci_disabled; -int mce_ignore_ce; -int mce_ser; +static int tolerant __read_mostly = 1; +static int banks __read_mostly; +static u64 *bank __read_mostly; +static int rip_msr __read_mostly; +static int mce_bootlog __read_mostly = -1; +static int monarch_timeout __read_mostly = -1; +static int mce_panic_timeout __read_mostly; +static int mce_dont_log_ce __read_mostly; +int mce_cmci_disabled __read_mostly; +int mce_ignore_ce __read_mostly; +int mce_ser __read_mostly; +static unsigned long notify_user; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; -- cgit v1.2.3 From 1020bcbcc7da36001d9226c5d57e999949cb80c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:20:57 +0900 Subject: x86, mce: rename static variables around trigger "trigger" is not straight forward name for valiable that holds name of user mode helper program which triggered by machine check events. This patch renames this valiable and kins to more recognizable names. trigger => mce_helper trigger_argv => mce_helper_argv notify_user => mce_need_notify No functional changes. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2d2e0b565a9..e8b893e880e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -88,9 +88,10 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; -static unsigned long notify_user; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +/* User mode helper program triggered by machine check event */ +static unsigned long mce_need_notify; +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; static unsigned long dont_init_banks; @@ -180,7 +181,7 @@ void mce_log(struct mce *mce) wmb(); mce->finished = 1; - set_bit(0, ¬ify_user); + set_bit(0, &mce_need_notify); } static void print_mce(struct mce *m) @@ -1122,7 +1123,7 @@ static void mcheck_timer(unsigned long data) static void mce_do_trigger(struct work_struct *work) { - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); } static DECLARE_WORK(mce_trigger_work, mce_do_trigger); @@ -1139,7 +1140,7 @@ int mce_notify_irq(void) clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, ¬ify_user)) { + if (test_and_clear_bit(0, &mce_need_notify)) { wake_up_interruptible(&mce_wait); /* @@ -1147,7 +1148,7 @@ int mce_notify_irq(void) * work_pending is always cleared before the function is * executed. */ - if (trigger[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1664,9 +1665,9 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - strcpy(buf, trigger); + strcpy(buf, mce_helper); strcat(buf, "\n"); - return strlen(trigger) + 1; + return strlen(mce_helper) + 1; } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, @@ -1675,10 +1676,10 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *p; int len; - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + len = strlen(mce_helper); + p = strchr(mce_helper, '\n'); if (*p) *p = 0; -- cgit v1.2.3 From 9af43b54ab4509f1dac49637d6917d57292e6518 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:21:36 +0900 Subject: x86, mce: sysfs entries for new mce options Add sysfs interface for admins who want to tweak these options without rebooting the system. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 84 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e8b893e880e..4b62443b6e7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1626,6 +1626,26 @@ static void mce_restart(void) on_each_cpu(mce_cpu_restart, NULL, 1); } +/* Toggle features for corrected errors */ +static void mce_disable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + if (all) + del_timer_sync(&__get_cpu_var(mce_timer)); + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(¤t_cpu_data)) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + mce_init_timer(); +} + static struct sysdev_class mce_sysclass = { .suspend = mce_suspend, .shutdown = mce_shutdown, @@ -1687,6 +1707,52 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t set_ignore_ce(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_ignore_ce = 1; + } else { + /* enable ce features */ + mce_ignore_ce = 0; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + return size; +} + +static ssize_t set_cmci_disabled(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mce_cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_ce, NULL, 1); + mce_cmci_disabled = 1; + } else { + /* enable cmci */ + mce_cmci_disabled = 0; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + return size; +} + static ssize_t store_int_with_restart(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t size) @@ -1699,6 +1765,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1706,9 +1773,24 @@ static struct sysdev_ext_attribute attr_check_interval = { &check_interval }; +static struct sysdev_ext_attribute attr_ignore_ce = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), + &mce_ignore_ce +}; + +static struct sysdev_ext_attribute attr_cmci_disabled = { + _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + &mce_cmci_disabled +}; + static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_tolerant.attr, + &attr_check_interval.attr, + &attr_trigger, &attr_monarch_timeout.attr, + &attr_dont_log_ce.attr, + &attr_ignore_ce.attr, + &attr_cmci_disabled.attr, NULL }; -- cgit v1.2.3 From 9e55e44e39798541ba39d57f4b569deb555ae1ce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:15 +0900 Subject: x86, mce: unify mce.h There are 2 headers: arch/x86/include/asm/mce.h arch/x86/kernel/cpu/mcheck/mce.h and in the latter small header: #include This patch move all contents in the latter 
header into the former, and fix all files using the latter to include the former instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/k7.c | 3 +-- arch/x86/kernel/cpu/mcheck/mce.c | 1 - arch/x86/kernel/cpu/mcheck/mce.h | 38 ------------------------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 3 +-- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 -- arch/x86/kernel/cpu/mcheck/non-fatal.c | 3 +-- arch/x86/kernel/cpu/mcheck/p4.c | 3 +-- arch/x86/kernel/cpu/mcheck/p5.c | 3 +-- arch/x86/kernel/cpu/mcheck/p6.c | 3 +-- arch/x86/kernel/cpu/mcheck/winchip.c | 3 +-- 10 files changed, 7 insertions(+), 55 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce.h (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index 89e51042415..b945d5dbc60 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b62443b6e7..faedd776847 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,7 +44,6 @@ #include #include "mce-internal.h" -#include "mce.h" /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index 84a552b458c..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include - -#ifdef CONFIG_X86_OLD_MCE -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - -#ifdef CONFIG_X86_ANCIENT_MCE -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); -extern int mce_p5_enable; -static inline int mce_p5_enabled(void) { return mce_p5_enable; } -static inline void enable_p5_mce(void) { mce_p5_enable = 1; } -#else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline int mce_p5_enabled(void) { return 0; } -static inline void enable_p5_mce(void) { } -#endif - -/* Call the installed machine check handler for this CPU setup. 
*/ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); - -#ifdef CONFIG_X86_OLD_MCE - -extern int nr_mce_banks; - -void intel_set_thermal_handler(void); - -#else - -static inline void intel_set_thermal_handler(void) { } - -#endif - -void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d857..475478b2088 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,10 +11,9 @@ #include #include #include +#include #include -#include "mce.h" - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index f2ef6952c40..c548111d011 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,8 +16,6 @@ #include #include -#include "mce.h" - asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 70b710420f7..f5f2d6f71fb 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -17,10 +17,9 @@ #include #include +#include #include -#include "mce.h" - static int firstbank; #define MCE_RATE (15*HZ) /* timer rate is 15s */ diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 82cee108a2d..8d3e40edd64 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,10 +12,9 @@ #include #include #include +#include #include -#include "mce.h" - /* as supported by the P4/Xeon family */ struct intel_mce_extended_msrs { u32 eax; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 015f481ab1b..747853f9188 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* By default disabled */ int mce_p5_enable; diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 43c24e66745..01e4f817818 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -10,10 +10,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 81b02487090..54060f56597 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -9,10 +9,9 @@ #include #include +#include #include -#include "mce.h" - /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { -- cgit v1.2.3 From c697836985e18d9c34897428ba563b13044a6dcd Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:22:49 +0900 Subject: x86, mce: make mce_disabled boolean The mce_disabled on 32bit is a tristate variable [1,0,-1], while 64bit version is boolean [0,1]. This patch makes mce_disabled always boolean, and use mce_p5_enabled to indicate the third state instead. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 8 +++----- arch/x86/kernel/cpu/mcheck/p5.c | 12 +++++------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index faedd776847..6095e0296ab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1286,8 +1286,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) return; switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (mce_p5_enabled()) - intel_p5_mcheck_init(c); + intel_p5_mcheck_init(c); break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); @@ -2002,7 +2001,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { - if (mce_disabled == 1) + if (mce_disabled) return; switch (c->x86_vendor) { @@ -2032,10 +2031,9 @@ void mcheck_init(struct cpuinfo_x86 *c) static int __init mcheck_enable(char *str) { - mce_disabled = -1; + mce_p5_enabled = 1; return 1; } - __setup("mce", mcheck_enable); #endif /* CONFIG_X86_OLD_MCE */ diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 747853f9188..5c0e6533d9b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,7 +14,7 @@ #include /* By default disabled */ -int mce_p5_enable; +int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) @@ -42,15 +42,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /* Check for MCE support: */ - if (!cpu_has(c, X86_FEATURE_MCE)) + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) return; -#ifdef CONFIG_X86_OLD_MCE - /* Default P5 to off as its often misconnected: */ - if (mce_disabled != -1) + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) return; -#endif machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ -- cgit v1.2.3 From 3adacb70d32046ccc9f0333b50bb2ba1582ccdf4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:23:28 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare p4 Remove unused argument regs from handlers, and use inc_irq_stat. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 8d3e40edd64..ddeed6e295a 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -35,7 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) +static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", smp_processor_id()); @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) } /* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(struct pt_regs *regs) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -54,14 +54,13 @@ static void intel_thermal_interrupt(struct pt_regs *regs) } /* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = - unexpected_thermal_interrupt; +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); - vendor_thermal_interrupt(regs); - __get_cpu_var(irq_stat).irq_thermal_count++; + vendor_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); irq_exit(); } -- cgit v1.2.3 From 5335612a574a45beab14193ec641ed2f45e7a523 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:09 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare mce_intel_64 Break smp_thermal_interrupt() into two functions. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c548111d011..a5232b2c4ca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,19 +16,21 @@ #include #include -asmlinkage void smp_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - - exit_idle(); - irq_enter(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); +} +asmlinkage void smp_thermal_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); irq_exit(); } -- cgit v1.2.3 From e8ce2c5ee826b3787202493effcd08d4b1e1e639 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:24:40 +0900 Subject: x86, mce: unify smp_thermal_interrupt, prepare Let them in same shape. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 20 ++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/p4.c | 10 ++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a5232b2c4ca..922e3a482f6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -16,6 +16,14 @@ #include #include +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -25,14 +33,22 @@ static void intel_thermal_interrupt(void) mce_log_therm_throt_event(msr_val); } +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; + asmlinkage void smp_thermal_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); - intel_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + intel_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index ddeed6e295a..75313f5b624 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -47,10 +48,9 @@ static void intel_thermal_interrupt(void) { __u64 msr_val; - ack_APIC_irq(); - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & THERM_STATUS_PROCHOT); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); } /* Thermal interrupt handler for this CPU setup: */ @@ -58,10 +58,12 @@ static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); - vendor_thermal_interrupt(); inc_irq_stat(irq_thermal_count); + vendor_thermal_interrupt(); irq_exit(); + ack_APIC_irq(); } void intel_set_thermal_handler(void) -- cgit v1.2.3 From a65c88dd2c83b569dbd13778da689861bdf977f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:25:27 +0900 Subject: x86, mce: unify smp_thermal_interrupt Put common functions into therm_throt.c, modify Makefile. unexpected_thermal_interrupt intel_thermal_interrupt smp_thermal_interrupt intel_set_thermal_handler Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 7 ++--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 38 -------------------------- arch/x86/kernel/cpu/mcheck/p4.c | 45 ------------------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 40 ++++++++++++++++++++++++++- 4 files changed, 43 insertions(+), 87 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 45004faf67e..53df57d11c5 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,12 @@ -obj-y = mce.o therm_throt.o +obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 922e3a482f6..3b7a0572e2f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -9,48 +9,10 @@ #include #include #include -#include #include #include -#include -#include #include -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -asmlinkage void smp_thermal_interrupt(void) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - intel_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. 
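The thermal-interrupt patches in this series all converge on one pattern: a single entry point performs the common interrupt prologue/epilogue and then dispatches through a function pointer (smp_thermal_vector in the code added to therm_throt.c) that defaults to an "unexpected interrupt" handler and is replaced with the vendor-specific handler during CPU init. The sketch below is a minimal user-space model of that pattern in plain C; it is illustrative only and not kernel code. The names thermal_entry and thermal_vector are invented for the example, and the comments merely stand in for irq_enter()/inc_irq_stat()/irq_exit()/ack_APIC_irq() in the real handler.

#include <stdio.h>

/* Default handler: reached only if no vendor handler was installed. */
static void unexpected_thermal(void)
{
	printf("unexpected LVT TMR interrupt\n");
}

/* Vendor-specific handler, installed during init. */
static void intel_thermal(void)
{
	printf("handling Intel thermal event\n");
}

/* Dispatch pointer, playing the role of smp_thermal_vector above. */
static void (*thermal_vector)(void) = unexpected_thermal;

/* Common entry point: shared prologue/epilogue around the dispatch. */
static void thermal_entry(void)
{
	/* irq_enter(); inc_irq_stat(irq_thermal_count); in the kernel */
	thermal_vector();
	/* irq_exit(); ack_APIC_irq(); in the kernel */
}

int main(void)
{
	thermal_entry();                /* falls through to the default handler */
	thermal_vector = intel_thermal; /* what intel_set_thermal_handler() does */
	thermal_entry();                /* now reaches the vendor handler */
	return 0;
}

Dispatching through the pointer is what lets the 32-bit (p4.c) and 64-bit (mce_intel_64.c) copies of smp_thermal_interrupt() collapse into the single implementation added to therm_throt.c below.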
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 75313f5b624..76c5f0305f9 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -1,8 +1,6 @@ /* * P4 specific Machine Check Exception Reporting */ - -#include #include #include #include @@ -10,9 +8,6 @@ #include #include -#include -#include -#include #include #include @@ -33,46 +28,6 @@ struct intel_mce_extended_msrs { static int mce_num_extended_msrs; - -#ifdef CONFIG_X86_MCE_P4THERMAL - -static void unexpected_thermal_interrupt(void) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler: */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) - mce_log_therm_throt_event(msr_val); -} - -/* Thermal interrupt handler for this CPU setup: */ -static void (*vendor_thermal_interrupt)(void) = unexpected_thermal_interrupt; - -void smp_thermal_interrupt(struct pt_regs *regs) -{ - exit_idle(); - irq_enter(); - inc_irq_stat(irq_thermal_count); - vendor_thermal_interrupt(); - irq_exit(); - ack_APIC_irq(); -} - -void intel_set_thermal_handler(void) -{ - vendor_thermal_interrupt = intel_thermal_interrupt; -} - -#endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7b1ae2e20ba..b3792b19685 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -13,6 +13,7 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ +#include #include #include #include @@ -20,6 +21,8 @@ #include #include +#include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -186,6 +189,41 @@ static __init int thermal_throttle_init_device(void) return 0; } - device_initcall(thermal_throttle_init_device); + #endif /* CONFIG_SYSFS */ + +/* Thermal transition interrupt handler */ +void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + mce_log_therm_throt_event(msr_val); +} + +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + exit_idle(); + irq_enter(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} + +void intel_set_thermal_handler(void) +{ + smp_thermal_vector = intel_thermal_interrupt; +} -- cgit v1.2.3 From 895287c0a6aa571160c47ee10de11b542166c4f9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:10 +0900 Subject: x86, mce: squash mce_intel.c into therm_throt.c move intel_init_thermal() into therm_throt.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 -------------------------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 66 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 74 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 53df57d11c5..659564e5fc0 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o mce_intel.o +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c deleted file mode 100644 index 475478b2088..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Common code for Intel machine checks - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? 
"TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3792b19685..7a508aaafce 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -16,13 +16,21 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include +#include +#include +#include #include #include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -227,3 +235,61 @@ void intel_set_thermal_handler(void) { smp_thermal_vector = intel_thermal_interrupt; } + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} -- cgit v1.2.3 From 8363fc82d36c0886292e33925391dca93f03bd50 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:26:36 +0900 Subject: x86, mce: remove intel_set_thermal_handler() and make intel_thermal_interrupt() static. Signed-off-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7a508aaafce..7c7944cc515 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,7 +202,7 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ /* Thermal transition interrupt handler */ -void intel_thermal_interrupt(void) +static void intel_thermal_interrupt(void) { __u64 msr_val; @@ -231,11 +231,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void intel_set_thermal_handler(void) -{ - smp_thermal_vector = intel_thermal_interrupt; -} - void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -278,7 +273,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - intel_set_thermal_handler(); + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); -- cgit v1.2.3 From 1149e7264528723b8c3b075a90386ceb2e07070a Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:27:13 +0900 Subject: x86, mce: remove therm_throt.h Now all symbols in the header are static. Remove the header. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - arch/x86/kernel/cpu/mcheck/p4.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 5 ++--- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 3b7a0572e2f..663a88e8b3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -11,7 +11,6 @@ #include #include #include -#include /* * Support for Intel Correct Machine Check Interrupts. This allows diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 76c5f0305f9..4482aea9aa2 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7c7944cc515..bff8dd191dd 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -38,7 +37,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ @@ -93,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. 
*/ -int therm_throt_process(int curr) +static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); -- cgit v1.2.3 From 1af0815f9639752409a9c15ba6a3dc0f322fd114 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 15 Jun 2009 17:28:38 +0900 Subject: x86, mce: rename _64.c files which are no longer 64-bit-specific Rename files that are no longer 64bit specific: mce_amd_64.c => mce_amd.c mce_intel_64.c => mce_intel.c Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 4 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 703 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 703 ------------------------------ arch/x86/kernel/cpu/mcheck/mce_intel.c | 225 ++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 225 ---------- 5 files changed, 930 insertions(+), 930 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce_amd.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_amd_64.c create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel_64.c (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 659564e5fc0..188a1ca5ad2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -3,8 +3,8 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c new file mode 100644 index 00000000000..ddae21620bd --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -0,0 +1,703 @@ +/* + * (c) 2005, 2006 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. 
+ * + * Support : jacob.shin@amd.com + * + * April 2006 + * - added support for AMD Family 0x10 processors + * + * All MC4_MISCi registers are shared between multi-cores + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +struct threshold_block { + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; +}; + +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +#ifdef CONFIG_SMP +static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 +}; +#endif + +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +static void amd_threshold_interrupt(void); + +/* + * CPU Initialization + */ + +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + +/* must be called with correct cpu affinity */ +/* Called via smp_call_function_single() */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 mci_misc_hi, mci_misc_lo; + + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + mci_misc_hi = + (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (mci_misc_hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + tr->b->interrupt_enable ? 
+ (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (mci_misc_hi &= ~MASK_INT_TYPE_HI); + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + struct thresh_restart tr; + u8 lvt_off; + + for (bank = 0; bank < NR_BANKS; ++bank) { + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else + ++address; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); +#ifdef CONFIG_SMP + if (shared_bank[bank] && c->cpu_core_id) + break; +#endif + lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0); + + high &= ~MASK_LVTOFF_HI; + high |= lvt_off << 20; + wrmsr(address, low, high); + + threshold_defaults.address = address; + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; + } + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. + */ +static void amd_threshold_interrupt(void) +{ + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + struct mce m; + + mce_setup(&m); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) { + address = MSR_IA32_MC0_MISC + bank * 4; + } else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + /* + * Log the machine check that caused the threshold + * event. 
+ */ + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + return; + } + } + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + tr.reset = 0; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +struct threshold_block_cross_cpu { + struct threshold_block *tb; + long retval; +}; + +static void local_error_count_handler(void *_tbcc) +{ + struct threshold_block_cross_cpu *tbcc = _tbcc; + struct threshold_block *b = tbcc->tb; + u32 low, high; + + rdmsr(b->address, low, high); + tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + struct threshold_block_cross_cpu tbcc = { .tb = b, }; + + smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); + return sprintf(buf, "%lx\n", tbcc.retval); +} + +static ssize_t store_error_count(struct threshold_block *b, + const char *buf, size_t count) +{ + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + return 1; +} + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); + +static struct attribute *default_attrs[] = { + &interrupt_enable.attr, + &threshold_limit.attr, + &error_count.attr, + NULL +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? 
a->store(b, buf, count) : -EIO; + + return ret; +} + +static struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + } else { + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + "misc%i", block); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return err; + +out_free: + if (b) { + kobject_put(&b->kobj); + kfree(b); + } + return err; +} + +static __cpuinit long +local_allocate_threshold_blocks(int cpu, unsigned int bank) +{ + return allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); +} + +/* symlinks sibling shared banks to first core. first core owns dir/files. 
*/ +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + int i, err = 0; + struct threshold_bank *b = NULL; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ + i = cpumask_first(cpu_core_mask(cpu)); + + /* first core not up yet */ + if (cpu_data(i).cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; + + if (!b) + goto out; + + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, + b->kobj, name); + if (err) + goto out; + + cpumask_copy(b->cpus, cpu_core_mask(cpu)); + per_cpu(threshold_banks, cpu)[bank] = b; + + goto out; + } +#endif + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + kfree(b); + err = -ENOMEM; + goto out; + } + + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); + if (!b->kobj) + goto out_free; + +#ifndef CONFIG_SMP + cpumask_setall(b->cpus); +#else + cpumask_copy(b->cpus, cpu_core_mask(cpu)); +#endif + + per_cpu(threshold_banks, cpu)[bank] = b; + + err = local_allocate_threshold_blocks(cpu, bank); + if (err) + goto out_free; + + for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, + b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + free_cpumask_var(b->cpus); + kfree(b); +out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static __cpuinit int threshold_create_device(unsigned int cpu) +{ + unsigned int bank; + int err = 0; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto out; + } +out: + return err; +} + +/* + * let's be hotplug friendly. + * in case of multiple core processors, the first core always takes ownership + * of shared sysfs dir/files, and rest of the cores will be symlinked to it. 
+ */ + +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_put(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct threshold_bank *b; + char name[32]; + int i = 0; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); + per_cpu(threshold_banks, cpu)[bank] = NULL; + + return; + } +#endif + + /* remove all sibling symlinks before unregistering */ + for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_del(b->kobj); + kobject_put(b->kobj); + free_cpumask_var(b->cpus); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; +} + +static void threshold_remove_device(unsigned int cpu) +{ + unsigned int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + threshold_remove_bank(cpu, bank); + } +} + +/* get notified when a cpu comes on/off */ +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + threshold_create_device(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + threshold_remove_device(cpu); + break; + default: + break; + } +} + +static __init int threshold_init_device(void) +{ + unsigned lcpu = 0; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = threshold_create_device(lcpu); + + if (err) + return err; + } + threshold_cpu_callback = amd_64_threshold_cpu_callback; + + return 0; +} +device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c deleted file mode 100644 index ddae21620bd..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ /dev/null @@ -1,703 +0,0 @@ -/* - * (c) 2005, 2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Written by Jacob Shin - AMD, Inc. 
- * - * Support : jacob.shin@amd.com - * - * April 2006 - * - added support for AMD Family 0x10 processors - * - * All MC4_MISCi registers are shared between multi-cores - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" -#define NR_BANKS 6 -#define NR_BLOCKS 9 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_CNTP_HI 0x40000000 -#define MASK_LOCKED_HI 0x20000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 -#define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_BLKPTR_LO 0xFF000000 -#define MCG_XBLK_ADDR 0xC0000400 - -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; -}; - -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - -#ifdef CONFIG_SMP -static unsigned char shared_bank[NR_BANKS] = { - 0, 0, 0, 0, 1 -}; -#endif - -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ - -static void amd_threshold_interrupt(void); - -/* - * CPU Initialization - */ - -struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; -}; - -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ -static void threshold_restart_bank(void *_tr) -{ - struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; - - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - tr->reset = 1; /* limit cannot be lower than err count */ - - if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - tr->b->threshold_limit); - } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (tr->old_limit - tr->b->threshold_limit); - - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | - (new_count & THRESHOLD_MAX); - } - - tr->b->interrupt_enable ? 
- (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); - - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); -} - -/* cpu init entry point, called from mce.c with preempt off */ -void mce_amd_feature_init(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - u32 low = 0, high = 0, address = 0; - unsigned int bank, block; - struct thresh_restart tr; - u8 lvt_off; - - for (bank = 0; bank < NR_BANKS; ++bank) { - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else - ++address; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - break; -#endif - lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0); - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); - - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); - - mce_threshold_vector = amd_threshold_interrupt; - } - } -} - -/* - * APIC Interrupt Handler - */ - -/* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. - */ -static void amd_threshold_interrupt(void) -{ - u32 low = 0, high = 0, address = 0; - unsigned int bank, block; - struct mce m; - - mce_setup(&m); - - /* assume first bank caused it */ - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) - continue; - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MC0_MISC + bank * 4; - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - /* - * Log the machine check that caused the threshold - * event. 
- */ - machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - return; - } - } - } -} - -/* - * Sysfs Interface - */ - -struct threshold_attr { - struct attribute attr; - ssize_t (*show) (struct threshold_block *, char *); - ssize_t (*store) (struct threshold_block *, const char *, size_t count); -}; - -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ -} -SHOW_FIELDS(interrupt_enable) -SHOW_FIELDS(threshold_limit) - -static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (strict_strtoul(buf, 0, &new) < 0) - return -EINVAL; - - b->interrupt_enable = !!new; - - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (strict_strtoul(buf, 0, &new) < 0) - return -EINVAL; - - if (new > THRESHOLD_MAX) - new = THRESHOLD_MAX; - if (new < 1) - new = 1; - - tr.old_limit = b->threshold_limit; - b->threshold_limit = new; - tr.b = b; - tr.reset = 0; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - -static ssize_t show_error_count(struct threshold_block *b, char *buf) -{ - struct threshold_block_cross_cpu tbcc = { .tb = b, }; - - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} - -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; -} - -#define RW_ATTR(val) \ -static struct threshold_attr val = { \ - .attr = {.name = __stringify(val), .mode = 0644 }, \ - .show = show_## val, \ - .store = store_## val, \ -}; - -RW_ATTR(interrupt_enable); -RW_ATTR(threshold_limit); -RW_ATTR(error_count); - -static struct attribute *default_attrs[] = { - &interrupt_enable.attr, - &threshold_limit.attr, - &error_count.attr, - NULL -}; - -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->show ? a->show(b, buf) : -EIO; - - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->store ? 
a->store(b, buf, count) : -EIO; - - return ret; -} - -static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, -}; - -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) -{ - struct threshold_block *b = NULL; - u32 low, high; - int err; - - if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) - return 0; - - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) - return 0; - - if (!(high & MASK_VALID_HI)) { - if (block) - goto recurse; - else - return 0; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - goto recurse; - - b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); - if (!b) - return -ENOMEM; - - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; - - INIT_LIST_HEAD(&b->miscj); - - if (per_cpu(threshold_banks, cpu)[bank]->blocks) { - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - } else { - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - } - - err = kobject_init_and_add(&b->kobj, &threshold_ktype, - per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); - if (err) - goto out_free; -recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } - - err = allocate_threshold_blocks(cpu, bank, ++block, address); - if (err) - goto out_free; - - if (b) - kobject_uevent(&b->kobj, KOBJ_ADD); - - return err; - -out_free: - if (b) { - kobject_put(&b->kobj); - kfree(b); - } - return err; -} - -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) -{ - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); -} - -/* symlinks sibling shared banks to first core. first core owns dir/files. 
*/ -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - int i, err = 0; - struct threshold_bank *b = NULL; - char name[32]; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_core_mask(cpu)); - - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; - - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; - - b = per_cpu(threshold_banks, i)[bank]; - - if (!b) - goto out; - - err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, - b->kobj, name); - if (err) - goto out; - - cpumask_copy(b->cpus, cpu_core_mask(cpu)); - per_cpu(threshold_banks, cpu)[bank] = b; - - goto out; - } -#endif - - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); - if (!b) { - err = -ENOMEM; - goto out; - } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } - - b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); - if (!b->kobj) - goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_copy(b->cpus, cpu_core_mask(cpu)); -#endif - - per_cpu(threshold_banks, cpu)[bank] = b; - - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, - b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; - } - - goto out; - -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); - kfree(b); -out: - return err; -} - -/* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - int err = 0; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - goto out; - } -out: - return err; -} - -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. 
- */ - -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; - - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); - list_del(&pos->miscj); - kfree(pos); - } - - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; -} - -static void threshold_remove_bank(unsigned int cpu, int bank) -{ - struct threshold_bank *b; - char name[32]; - int i = 0; - - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; - if (!b->blocks) - goto free_out; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; - } - - deallocate_threshold_block(cpu, bank); - -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - free_cpumask_var(b->cpus); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; -} - -static void threshold_remove_device(unsigned int cpu) -{ - unsigned int bank; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - threshold_remove_bank(cpu, bank); - } -} - -/* get notified when a cpu comes on/off */ -static void __cpuinit -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; - } -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); - - if (err) - return err; - } - threshold_cpu_callback = amd_64_threshold_cpu_callback; - - return 0; -} -device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 00000000000..663a88e8b3c --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,225 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include + +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. 
+ */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mce_cmci_disabled || mce_ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_irq(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + unsigned long flags; + int hdr = 0; + int i; + + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. 
+ */ +void cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu(cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); + intel_init_cmci(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c deleted file mode 100644 index 663a88e8b3c..00000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo - * Copyright (C) 2008, 2009 Intel Corporation - * Author: Andi Kleen - */ - -#include -#include -#include -#include -#include -#include - -/* - * Support for Intel Correct Machine Check Interrupts. This allows - * the CPU to raise an interrupt when a corrected machine check happened. - * Normally we pick those up using a regular polling timer. - * Also supports reliable discovery of shared banks. - */ - -static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); - -/* - * cmci_discover_lock protects against parallel discovery attempts - * which could race against each other. - */ -static DEFINE_SPINLOCK(cmci_discover_lock); - -#define CMCI_THRESHOLD 1 - -static int cmci_supported(int *banks) -{ - u64 cap; - - if (mce_cmci_disabled || mce_ignore_ce) - return 0; - - /* - * Vendor check is not strictly needed, but the initial - * initialization is vendor keyed and this - * makes sure none of the backdoors are entered otherwise. - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); - return !!(cap & MCG_CMCI_P); -} - -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_irq(); -} - -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - -/* - * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks - * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. 
- */ -static void cmci_discover(int banks, int boot) -{ - unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); - unsigned long flags; - int hdr = 0; - int i; - - spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - u64 val; - - if (test_bit(i, owned)) - continue; - - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Already owned by someone else? */ - if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) - print_update("SHD", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - continue; - } - - val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) - print_update("CMCI", &hdr, i); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); - } else { - WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); - } - } - spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); -} - -/* - * Just in case we missed an event during initialization check - * all the CMCI owned banks. - */ -void cmci_recheck(void) -{ - unsigned long flags; - int banks; - - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) - return; - local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - local_irq_restore(flags); -} - -/* - * Disable CMCI on this CPU for all banks it owns when it goes down. - * This allows other CPUs to claim the banks on rediscovery. - */ -void cmci_clear(void) -{ - unsigned long flags; - int i; - int banks; - u64 val; - - if (!cmci_supported(&banks)) - return; - spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } - spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) -{ - int banks; - int cpu; - cpumask_var_t old; - - if (!cmci_supported(&banks)) - return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - - for_each_online_cpu(cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); -} - -/* - * Reenable CMCI on this CPU in case a CPU down failed. - */ -void cmci_reenable(void) -{ - int banks; - if (cmci_supported(&banks)) - cmci_discover(banks, 0); -} - -static void intel_init_cmci(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); - /* - * For CPU #0 this runs with still disabled APIC, but that's - * ok because only the vector is set up. We still do another - * check for the banks later for CPU #0 just to make sure - * to not miss any events. 
- */ - apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); - cmci_recheck(); -} - -void mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); - intel_init_cmci(); -} -- cgit v1.2.3 From 95ee14e4379c5e19c0897c872350570402014742 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 9 Jun 2009 18:20:39 -0700 Subject: x86: cap iomem_resource to addressable physical memory iomem_resource is by default initialized to -1, which means 64 bits of physical address space if 64-bit resources are enabled. However, x86 CPUs cannot address 64 bits of physical address space. Thus, we want to cap the physical address space to what the union of all CPU can actually address. Without this patch, we may end up assigning inaccessible values to uninitialized 64-bit PCI memory resources. Signed-off-by: H. Peter Anvin Cc: Matthew Wilcox Cc: Jesse Barnes Cc: Martin Mares Cc: stable@kernel.org --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3ffdcfa9abd..5b9cb8839ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -853,6 +853,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif + + /* Cap the iomem address space to what is addressable on all CPUs */ + iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 203abd67b75f7714ce98ab0cdbd6cfd7ad79dec4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Jun 2009 14:52:01 +0200 Subject: x86: mce: Handle banks == 0 case in K7 quirk Vegard Nossum reported: > I get an MCE-related crash like this in latest linus tree: > > [ 0.115341] CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line) > [ 0.116396] CPU: L2 Cache: 512K (64 bytes/line) > [ 0.120570] mce: CPU supports 0 MCE banks > [ 0.124870] BUG: unable to handle kernel NULL pointer dereference at 00000000 00000010 > [ 0.128001] IP: [] mcheck_init+0x278/0x320 > [ 0.128001] PGD 0 > [ 0.128001] Thread overran stack, or stack corrupted > [ 0.128001] Oops: 0002 [#1] PREEMPT SMP > [ 0.128001] last sysfs file: > [ 0.128001] CPU 0 > [ 0.128001] Modules linked in: > [ 0.128001] Pid: 0, comm: swapper Not tainted 2.6.30 #426 > [ 0.128001] RIP: 0010:[] [] mcheck_init+0x278/0x320 > [ 0.128001] RSP: 0018:ffffffff81595e38 EFLAGS: 00000246 > [ 0.128001] RAX: 0000000000000010 RBX: ffffffff8158f900 RCX: 0000000000000000 > [ 0.128001] RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000000000010 > [ 0.128001] RBP: ffffffff81595e68 R08: 0000000000000001 R09: 0000000000000000 > [ 0.128001] R10: 0000000000000010 R11: 0000000000000000 R12: 0000000000000000 > [ 0.128001] R13: 00000000ffffffff R14: 0000000000000000 R15: 0000000000000000 > [ 0.128001] FS: 0000000000000000(0000) GS:ffff880002288000(0000) knlGS:00000 > 00000000000 > [ 0.128001] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > [ 0.128001] CR2: 0000000000000010 CR3: 0000000001001000 CR4: 00000000000006b0 > [ 0.128001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 0.128001] DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 > [ 0.128001] Process swapper (pid: 0, threadinfo ffffffff81594000, task ffffff > ff8152a4a0) > [ 0.128001] Stack: > [ 0.128001] 0000000081595e68 5aa50ed3b4ddbe6e ffffffff8158f900 ffffffff8158f > 914 > [ 0.128001] ffffffff8158f948 0000000000000000 ffffffff81595eb8 
ffffffff813b8 > 69c > [ 0.128001] 5aa50ed3b4ddbe6e 00000001078bfbfd 0000062300000800 5aa50ed3b4ddb > e6e > [ 0.128001] Call Trace: > [ 0.128001] [] identify_cpu+0x331/0x392 > [ 0.128001] [] identify_boot_cpu+0x23/0x6e > [ 0.128001] [] check_bugs+0x1c/0x60 > [ 0.128001] [] start_kernel+0x403/0x46e > [ 0.128001] [] x86_64_start_reservations+0xac/0xd5 > [ 0.128001] [] x86_64_start_kernel+0x115/0x14b > [ 0.128001] [] ? early_idt_handler+0x0/0x71 This happens on QEMU which reports MCA capability, but no banks. Without this patch there is a buffer overrun and boot ops because the code would try to initialize the 0 element of a zero length kmalloc() buffer. Reported-by: Vegard Nossum Tested-by: Pekka Enberg Signed-off-by: Andi Kleen LKML-Reference: <20090615125200.GD31969@one.firstfloor.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e455..d9d77cfd8cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1245,7 +1245,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6) + if (c->x86 == 6 && banks > 0) bank[0] = 0; } -- cgit v1.2.3 From 5ce4243dcefbbc43791ffc36e1be55067ceec916 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 15 Jun 2009 22:26:33 +0400 Subject: x86: mce: Don't touch THERMAL_APIC_VECTOR if no active APIC present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If APIC was disabled (for some reason) and as result it's not even mapped we should not try to enable thermal interrupts at all. Reported-by: Simon Holm Thøgersen Tested-by: Simon Holm Thøgersen Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090615182633.GA7606@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d857..61e32881f41 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -21,9 +21,15 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + /* + * Thermal monitoring depends on ACPI, clock modulation + * and APIC as well + */ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC) || + !cpu_has(c, X86_FEATURE_APIC)) { + pr_debug("Thermal monitoring disabled\n"); return; + } /* * First check if its enabled already, in which case there might -- cgit v1.2.3 From 1bf7b31efa0c322d93cb3f772cd9bc743e8bb42d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 17 Jun 2009 08:31:15 -0700 Subject: x86, mce: mce_intel.c needs mce_intel.c uses apic_write() and lapic_get_maxlvt(), and so it needs . Signed-off-by: H. 
Peter Anvin Cc: Andi Kleen Cc: Hidetoshi Seto --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 663a88e8b3c..e1acec0f7a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From fe955e5c793aab398794be4c5ede172d48446c4a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Jun 2009 12:41:02 -0700 Subject: x86: nmi: Add Intel processor 0x6f4 to NMI perfctr1 workaround Expand Intel NMI perfctr1 workaround to include a Core2 processor stepping (cpuid family-6, model-f, stepping-4). Resolves a situation where the NMI would not enable on these processors. Signed-off-by: Prarit Bhargava Acked-by: Suresh Siddha Cc: prarit@redhat.com Cc: suresh.b.siddha@intel.com Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d6f5b9fbde3..5c481f6205b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* - * Work around Core Duo (Yonah) errata AE49 where perfctr1 - * doesn't have a working enable bit. + /* Work around where perfctr1 doesn't have a working enable + * bit as described in the following errata: + * AE49 Core Duo and Intel Core Solo 65 nm + * AN49 Intel Pentium Dual-Core + * AF49 Dual-Core Intel Xeon Processor LV */ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || + ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && + boot_cpu_data.x86_mask == 4))) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; } -- cgit v1.2.3 From 60f916dee612130c9977a8edd4abee98334202ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 19:00:20 +0200 Subject: perf_counter: x86: Set the period in the intel overflow handler Commit 9e350de37ac960 ("perf_counter: Accurate period data") missed a spot, which caused all Intel-PMU samples to have a period of 0. This broke auto-freq sampling. 
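The one-line repair, visible in the diff that follows, copies the counter's last programmed period (hw.last_period) into the sample data before it is handed to perf_counter_overflow(); condensed, the Intel overflow loop ends up doing:

	/* Condensed from the diff below; not additional new code. */
	data.period = counter->hw.last_period;

	if (perf_counter_overflow(counter, 1, &data))
		intel_pmu_disable_counter(&counter->hw, bit);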
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e8c68a5091d..ce1ae3f1f86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1224,6 +1224,8 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; + data.period = counter->hw.last_period; + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } -- cgit v1.2.3 From dc4c2a0aed3b09f6e255bd5c3faa50fe6e0b2ded Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 18 Jun 2009 00:35:58 +0200 Subject: i386: fix/simplify espfix stack switching, move it into assembly The espfix code triggers if we have a protected mode userspace application with a 16-bit stack. On returning to userspace, with iret, the CPU doesn't restore the high word of the stack pointer. This is an "official" bug, and the work-around used in the kernel is to temporarily switch to a 32-bit stack segment/pointer pair where the high word of the pointer is equal to the high word of the userspace stackpointer. The current implementation uses THREAD_SIZE to determine the cut-off, but there is no good reason not to use the more natural 64kb... However, implementing this by simply substituting THREAD_SIZE with 65536 in patch_espfix_desc crashed the test application. patch_espfix_desc tries to do what is described above, but gets it subtly wrong if the userspace stack pointer is just below a multiple of THREAD_SIZE: an overflow occurs to bit 13... With a bit of luck, when the kernelspace stackpointer is just below a 64kb-boundary, the overflow then ripples trough to bit 16 and userspace will see its stack pointer changed by 65536. This patch moves all espfix code into entry_32.S. Selecting a 16-bit cut-off simplifies the code. The game with changing the limit dynamically is removed too. It complicates matters and I see no value in it. Changing only the top 16-bit word of ESP is one instruction and it also implies that only two bytes of the ESPFIX GDT entry need to be changed and this can be implemented in just a handful simple to understand instructions. As a side effect, the operation to compute the original ESP from the ESPFIX ESP and the GDT entry simplifies a bit too, and the remaining three instructions have been expanded inline in entry_32.S. impact: can now reliably run userspace with ESP=xxxxfffc on 16-bit stack segment Signed-off-by: Alexander van Heukelum Acked-by: Stas Sergeev Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b9cb8839ca..ba1131ac943 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -108,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif -- cgit v1.2.3 From 74b602c7147212a7495879ec23fe6c2d3b470e06 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 14:43:32 -0700 Subject: x86: fix duplicated sysfs attribute The sysfs attribute cmci_disabled was accidentall turned into a duplicate of ignore_ce, breaking all other attributes. Signed-off-by: Yinghai Lu Acked-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2a560cefb67..5aac9e4dd13 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1777,7 +1777,7 @@ static struct sysdev_ext_attribute attr_ignore_ce = { }; static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_cmci_disabled), + _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), &mce_cmci_disabled }; -- cgit v1.2.3 From e92fae064ae42b2a4a77646f7655bca4c87bb1eb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 17 Jun 2009 16:21:33 -0700 Subject: x86: use zalloc_cpumask_var for mce_dev_initialized We need a cleared cpu_mask to record if mce is initialized, especially when MAXSMP is used. used zalloc_... instead Signed-off-by: Yinghai Lu Reviewed-by: Hidetoshi Seto Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5aac9e4dd13..c2fb70d0286 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1969,7 +1969,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) -- cgit v1.2.3 From b1f49f9582f9be6de5055cfa97eabf6246f2eaf7 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 18 Jun 2009 14:53:24 +0900 Subject: x86, mce: fix error path in mce_create_device() Don't skip removing mce_attrs in route from error2. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Cc: Huang Ying Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c2fb70d0286..284d1de968b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1798,7 +1798,7 @@ static cpumask_var_t mce_dev_initialized; static __cpuinit int mce_create_device(unsigned int cpu) { int err; - int i; + int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; @@ -1816,9 +1816,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } - for (i = 0; i < banks; i++) { + for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[i]); + &bank_attrs[j]); if (err) goto error2; } @@ -1826,8 +1826,8 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: - while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + while (--j >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); error: while (--i >= 0) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); -- cgit v1.2.3 From f9188e023c248d73f5b4a589b480e065c1864068 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2009 22:20:52 +0200 Subject: perf_counter: Make callchain samples extensible Before exposing upstream tools to a callchain-samples ABI, tidy it up to make it more extensible in the future: Use markers in the IP chain to denote context, use (u64)-1..-4095 range for these context markers because we use them for ERR_PTR(), so these addresses are unlikely to be mapped. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ce1ae3f1f86..76dfef23f78 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1555,9 +1555,9 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ static inline -void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +void callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; } @@ -1602,22 +1602,10 @@ static const struct stacktrace_ops backtrace_ops = { static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bp; - char *stack; - int nr = entry->nr; - + callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - get_bp(bp); -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry); - - entry->kernel = entry->nr - nr; + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } /* @@ -1669,16 +1657,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; - int nr = entry->nr; if (!user_mode(regs)) regs = task_pt_regs(current); fp = (void __user *)regs->bp; + callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); - while (entry->nr < MAX_STACK_DEPTH) { + while (entry->nr < PERF_MAX_STACK_DEPTH) { 
frame.next_frame = NULL; frame.return_address = 0; @@ -1691,8 +1679,6 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_frame; } - - entry->user = entry->nr - nr; } static void @@ -1728,9 +1714,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; - entry->hv = 0; - entry->kernel = 0; - entry->user = 0; perf_do_callchain(regs, entry); -- cgit v1.2.3 From 99bd0c0fc4b04da54cb311953ef9489931c19c63 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Jun 2009 10:59:09 +0200 Subject: x86: Set cpu_llc_id on AMD CPUs This counts when building sched domains in case NUMA information is not available. ( See cpu_coregroup_mask() which uses llc_shared_map which in turn is created based on cpu_llc_id. ) Currently Linux builds domains as follows: (example from a dual socket quad-core system) CPU0 attaching sched-domain: domain 0: span 0-7 level CPU groups: 0 1 2 3 4 5 6 7 ... CPU7 attaching sched-domain: domain 0: span 0-7 level CPU groups: 7 0 1 2 3 4 5 6 Ever since that is borked for multi-core AMD CPU systems. This patch fixes that and now we get a proper: CPU0 attaching sched-domain: domain 0: span 0-3 level MC groups: 0 1 2 3 domain 1: span 0-7 level CPU groups: 0-3 4-7 ... CPU7 attaching sched-domain: domain 0: span 4-7 level MC groups: 7 4 5 6 domain 1: span 0-7 level CPU groups: 4-7 0-3 This allows scheduler to assign tasks to cores on different sockets (i.e. that don't share last level cache) for performance reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20090619085909.GJ5218@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b4..28e5f595604 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } -- cgit v1.2.3 From d9f2a5ecb2846d0fd368fb4c45182e43f38e4471 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 20 Jun 2009 13:19:25 +0530 Subject: perf_counter, x8: Fix L1-data-Cache-Store-Referencees for AMD Fix AMD's Data Cache Refills from System event. 
After this patch : ./tools/perf/perf stat -e l1d -e l1d-misses -e l1d-write -e l1d-prefetch -e l1d-prefetch-miss -e l1i -e l1i-misses -e l1i-prefetch -e l2 -e l2-misses -e l2-write -e dtlb -e dtlb-misses -e itlb -e itlb-misses -e bpu -e bpu-misses ls /dev/ > /dev/null Performance counter stats for 'ls /dev/': 2499484 L1-data-Cache-Load-Referencees (scaled from 3.97%) 70347 L1-data-Cache-Load-Misses (scaled from 7.30%) 9360 L1-data-Cache-Store-Referencees (scaled from 8.64%) 32804 L1-data-Cache-Prefetch-Referencees (scaled from 17.72%) 7693 L1-data-Cache-Prefetch-Misses (scaled from 22.97%) 2180945 L1-instruction-Cache-Load-Referencees (scaled from 28.48%) 14518 L1-instruction-Cache-Load-Misses (scaled from 35.00%) 2405 L1-instruction-Cache-Prefetch-Referencees (scaled from 34.89%) 71387 L2-Cache-Load-Referencees (scaled from 34.94%) 18732 L2-Cache-Load-Misses (scaled from 34.92%) 79918 L2-Cache-Store-Referencees (scaled from 36.02%) 1295294 Data-TLB-Cache-Load-Referencees (scaled from 35.99%) 30896 Data-TLB-Cache-Load-Misses (scaled from 33.36%) 1222030 Instruction-TLB-Cache-Load-Referencees (scaled from 29.46%) 357 Instruction-TLB-Cache-Load-Misses (scaled from 20.46%) 530888 Branch-Cache-Load-Referencees (scaled from 11.48%) 8638 Branch-Cache-Load-Misses (scaled from 5.09%) 0.011295149 seconds time elapsed. Earlier it always shows value 0. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1245484165.3102.6.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..22eb3a1d4f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -401,7 +401,7 @@ static const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { -- cgit v1.2.3 From c14dab5c0782ef632742963a66276a195418a63c Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 24 Jun 2009 10:13:24 +0800 Subject: perf_counter, x86: Set global control MSR correctly Previous code made an assumption that the power on value of global control MSR has enabled all fixed and general purpose counters properly. However, this is not the case for certain Intel processors, such as Atom - and it might also be firmware dependent. Each enable bit in IA32_PERF_GLOBAL_CTRL is AND'ed with the enable bits for all privilege levels in the respective IA32_PERFEVTSELx or IA32_PERF_FIXED_CTR_CTRL MSRs to start/stop the counting of respective counters. Counting is enabled if the AND'ed results is true; counting is disabled when the result is false. The end result is that all fixed counters are always disabled on Atom processors because the assumption is just invalid. Fix this by not initializing the ctrl-mask out of the global MSR, but setting it to perf_counter_mask. 
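As a rough illustration of the AND semantics described above (this is not code from the patch; the EVNTSEL enable bit is bit 22, and the fixed counters use bits 32+i of the global MSR together with the ring-level fields of IA32_PERF_FIXED_CTR_CTRL), a general-purpose counter i only counts when both its global and its local enable bit are set:

	/*
	 * Illustrative sketch only: effective enable for general-purpose
	 * counter i per the description above.
	 */
	static int counter_is_counting(u64 global_ctrl, u64 evntsel, int i)
	{
		int global_en = (global_ctrl >> i) & 1;	/* IA32_PERF_GLOBAL_CTRL */
		int local_en  = (evntsel >> 22) & 1;	/* IA32_PERFEVTSELx.EN    */

		return global_en && local_en;
	}

A firmware power-on value of 0 in the global MSR therefore keeps every counter disabled regardless of the per-counter MSRs, which is exactly the Atom symptom described above.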
Reported-by: Stephane Eranian Signed-off-by: Yong Wang Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090624021324.GA2788@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22eb3a1d4f9..a310d19faca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -969,13 +969,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) @@ -1428,8 +1421,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1514,6 +1505,7 @@ void __init init_hw_perf_counters(void) perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3 From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a310d19faca..b83474b6021 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -912,6 +912,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -1034,6 +1036,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1126,6 +1130,8 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* -- cgit v1.2.3 From 5be6066a7f8d917db347d94f1b359b9b70dcb572 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 24 Jun 2009 09:21:10 +0900 Subject: x86, mce: percpu mcheck_timer should be pinned If CONFIG_NO_HZ + CONFIG_SMP, timer added via add_timer() might be migrated on other cpu. Use add_timer_on() instead. 
Avoids the following failure: Maciej Rutecki wrote: > > After normal boot I try: > > > > echo 1 > /sys/devices/system/machinecheck/machinecheck0/check_interval > > > > I found this in dmesg: > > > > [ 141.704025] ------------[ cut here ]------------ > > [ 141.704039] WARNING: at arch/x86/kernel/cpu/mcheck/mce.c:1102 > > mcheck_timer+0xf5/0x100() Reported-by: Maciej Rutecki Signed-off-by: Hidetoshi Seto Tested-by: Maciej Rutecki Acked-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..af425b83202 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data) *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1321,7 +1321,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* -- cgit v1.2.3 From ff8a4bae459a9b6455504127fcb78fdbc8e50e4c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 27 Jun 2009 12:22:27 -0700 Subject: Revert "x86: cap iomem_resource to addressable physical memory" This reverts commit 95ee14e4379c5e19c0897c872350570402014742. Mikael Petterson reported that at least one of his systems will not boot as a result. We have ruled out the detection algorithm malfunctioning, so it is not a matter of producing the incorrect bitmasks; rather, something in the application of them fails. Revert the commit until we can root cause and correct this problem. -stable team: this means the underlying commit should be rejected. Reported-and-isolated-by: Mikael Petterson Signed-off-by: H. Peter Anvin LKML-Reference: <200906261559.n5QFxJH8027336@pilspetsen.it.uu.se> Cc: stable@kernel.org Cc: Grant Grundler --- arch/x86/kernel/cpu/common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b26d4deada..f1961c07af9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -848,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - - /* Cap the iomem address space to what is addressable on all CPUs */ - iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 4078c444cf667f018c3fc7ebf141131a2b7c9480 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Jun 2009 00:41:11 -0700 Subject: perf_counter, x86: Update x86_pmu after WARN() The print out should read the value before changing the value. 
Signed-off-by: Yinghai Lu Cc: Peter Zijlstra LKML-Reference: <4A487017.4090007@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b83474b6021..d4cf4ce19aa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1496,17 +1496,17 @@ void __init init_hw_perf_counters(void) pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } perf_counter_mask |= -- cgit v1.2.3 From 0406ca6d8e849d9dd027c8cb6791448e81411aef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Jul 2009 21:02:09 +0200 Subject: perf_counter: Ignore the nmi call frames in the x86-64 backtraces About every callchains recorded with perf record are filled up including the internal perfcounter nmi frame: perf_callchain perf_counter_overflow intel_pmu_handle_irq perf_counter_nmi_handler notifier_call_chain atomic_notifier_call_chain notify_die do_nmi nmi We want ignore this frame as it's not interesting for instrumentation. To solve this, we simply ignore every frames from nmi context. New example of "perf report -s sym -c" after this patch: 9.59% [k] search_by_key 4.88% search_by_key reiserfs_read_locked_inode reiserfs_iget reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 3.19% search_by_key search_by_entry_key reiserfs_find_entry reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 [...] For now this patch only solves the problem in x86-64. 
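Condensed from the diff that follows, the approach is: the stack walker's callbacks remember whether the stack currently being visited is the NMI stack, and addresses found there are simply not stored in the callchain:

	/* Remember whether the stack being walked is the NMI stack ... */
	static int backtrace_stack(void *data, char *name)
	{
		per_cpu(in_nmi_frame, smp_processor_id()) =
			x86_is_stack_id(NMI_STACK, name);
		return 0;
	}

	/* ... and drop addresses found on it from the callchain. */
	static void backtrace_address(void *data, unsigned long addr, int reliable)
	{
		struct perf_callchain_entry *entry = data;

		if (per_cpu(in_nmi_frame, smp_processor_id()))
			return;

		if (reliable)
			callchain_store(entry, addr);
	}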
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Anton Blanchard Cc: Arnaldo Carvalho de Melo LKML-Reference: <1246474930-6088-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4cf4ce19aa..36c3dc7b899 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1561,6 +1561,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -1576,7 +1577,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Process all stacks: */ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + return 0; } @@ -1584,6 +1587,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + if (reliable) callchain_store(entry, addr); } -- cgit v1.2.3 From c7210e1ff8a95a0a6dba0d04a95b3ddd9d165fab Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 2 Jul 2009 22:35:35 +0530 Subject: x86: Remove unused function lapic_watchdog_ok() lapic_watchdog_ok() is a global function but no one is using it. Signed-off-by: Jaswinder Singh Rajput Cc: Andi Kleen Cc: Yinghai Lu LKML-Reference: <1246554335.2242.29.camel@jaswinder.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6205b..e60ed740d2b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} -- cgit v1.2.3 From a2e1b4c31257c07f148a89eb7eea7ca959fd0642 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 26 Jul 2009 10:55:25 -0500 Subject: [CPUFREQ] Powernow-k8: support family 0xf with 2 low p-states Provide support for family 0xf processors with 2 P-states below the elevator voltage. Remove the checks that prevent this configuration from being supported and increase the transition voltage to prevent errors during the transition. 
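Condensed from the diff that follows, the voltage part of the change doubles the number of ramp-voltage-offset (RVO) steps whenever both the current and the requested FID sit in the low FID table:

	/* Condensed sketch of the phase-1 voltage setup in the diff below. */
	u32 rvomult = 1;

	if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
		rvomult = 2;
	rvosteps = rvomult * data->rvo;	/* take twice as many RVO steps */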
Signed-off-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 30 +++++++++++------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 ++- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 81cbe64ed6b..ae068f59603 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index c9c1190b5e1..02ce824073c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); -- cgit v1.2.3 From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_ prefixes are no longer emitted as before the patch. is now included in the output on each additional use. Remove all uses of multiple KERN_s in formats. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index ae068f59603..2a50ef89100 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1259,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { static const char ACPI_PSS_BIOS_BUG_MSG[] = KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index af425b83202..484c1e5f658 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,14 +194,14 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); @@ -209,13 +209,13 @@ static void print_mce(struct mce *m) static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ -- cgit v1.2.3 From 11d1578f9454159c43499d1d8fe8a7d728c176a3 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 8 Jul 2009 17:46:14 -0400 Subject: perf_counter: Add P6 PMU support Add basic P6 PMU support. The P6 uses the EVNTSEL0 EN bit to enable/disable both its counters. We use this for the global enable/disable, and clear all config bits (except EN) to disable individual counters. Actual ia32 hardware doesn't support lfence, so use a locked op without side-effect to implement a full barrier. 
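(The locked op alluded to here is the classic pre-SSE2 barrier idiom; the sketch below only illustrates the technique and is not the kernel's actual barrier macro:)

	/* A locked read-modify-write on the stack orders all prior loads and
	 * stores on CPUs without *fence instructions; adding zero has no
	 * other side effect. */
	#define p6_full_barrier()					\
		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")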
perf stat and perf record seem to function correctly. [a.p.zijlstra@chello.nl: cleanups and complete the enable/disable code] Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 227 ++++++++++++++++++++++++++++++++++--- 1 file changed, 213 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 36c3dc7b899..1910f39ff19 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -65,6 +65,44 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int event) +{ + return p6_perfmon_event_map[event]; +} + +static u64 p6_pmu_raw_event(u64 event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_COUNTER_MASK) + + return event & P6_EVNTSEL_MASK; +} + + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -726,6 +764,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_disable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); @@ -767,6 +822,23 @@ void hw_perf_disable(void) return x86_pmu.disable_all(); } +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + static void intel_pmu_enable_all(void) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); @@ -819,16 +891,13 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - int err; - err = checking_wrmsrl(hwc->config_base + idx, - hwc->config); + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void @@ -836,13 +905,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; - int err; mask = 0xfULL << (idx * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - err = 
checking_wrmsrl(hwc->config_base, ctrl_val); + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE; + + if (!cpuc->enabled) + val = 0; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } static inline void @@ -943,6 +1023,17 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } +static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); + else + x86_pmu_disable_counter(hwc, idx); +} + + static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1176,6 +1267,49 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_counters *cpuc; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx, handled = 0; + u64 val; + + data.regs = regs; + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_counters); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + counter = cpuc->counters[idx]; + hwc = &counter->hw; + + val = x86_perf_counter_update(counter, hwc, idx); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + + if (perf_counter_overflow(counter, 1, &data)) + p6_pmu_disable_counter(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} /* * This handler is triggered by the local APIC, so the APIC IRQ handling @@ -1185,14 +1319,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_counters *cpuc; - int bit, cpu, loops; + int bit, loops; u64 ack, status; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); status = intel_pmu_get_status(); @@ -1249,14 +1382,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int cpu, idx, handled = 0; + int idx, handled = 0; u64 val; data.regs = regs; data.addr = 0; - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &__get_cpu_var(cpu_hw_counters); for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) @@ -1353,6 +1485,32 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_counter, + .disable = p6_pmu_disable_counter, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Counters have 40 bits implemented. 
However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .counter_bits = 32, + .counter_mask = (1ULL << 32) - 1, +}; + static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1392,6 +1550,41 @@ static struct x86_pmu amd_pmu = { .max_period = (1ULL << 47) - 1, }; +static int p6_pmu_init(void) +{ + int high, low; + + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* for Pentium M, we need to check if PMU exist */ + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (low & MSR_IA32_MISC_ENABLE_EMON) + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + if (!cpu_has_apic) { + pr_info("no Local APIC, try rebooting with lapic"); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + static int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1400,8 +1593,14 @@ static int intel_pmu_init(void) unsigned int ebx; int version; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { return -ENODEV; + } + } /* * Check whether the Architectural PerfMon supports -- cgit v1.2.3 From 9c74fb50867e8fb5f3be3be06716492c0f79309e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jul 2009 10:21:41 +0200 Subject: perf_counter: Fix up P6 PMU details The P6 doesn't seem to support cache ref/hit/miss counts, so we extend the generic hardware event codes to have 0 and -1 mean the same thing as for the generic cache events. Furthermore, it turns out the 0 event does not count (that is, its reported that on PPro it actually does count something), therefore use a event configuration that's specified not to count to disable the counters. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1910f39ff19..c7cc6eac14e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,6 +84,14 @@ static u64 p6_pmu_event_map(int event) return p6_perfmon_event_map[event]; } +/* + * Counter setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. 
+ */ +#define P6_NOP_COUNTER 0x0000002EULL + static u64 p6_pmu_raw_event(u64 event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -704,6 +712,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; + u64 config; int err; if (!x86_pmu_initialized()) @@ -756,10 +765,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (attr->config >= x86_pmu.max_events) return -EINVAL; + /* * The generic map: */ - hwc->config |= x86_pmu.event_map(attr->config); + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + hwc->config |= config; return 0; } @@ -767,7 +785,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val; + u64 val; if (!cpuc->enabled) return; @@ -917,10 +935,10 @@ static inline void p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE; + u64 val = P6_NOP_COUNTER; - if (!cpuc->enabled) - val = 0; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } -- cgit v1.2.3 From 984b838ce69c063a91b87550598ab7f3439dd94a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:59:56 +0200 Subject: perf_counter: Clean up global vs counter enable Ingo noticed that both AMD and P6 call x86_pmu_disable_counter() on *_pmu_enable_counter(). This is because we rely on the side effect of that call to program the event config but not touch the EN bit. We change that for AMD by having enable_all() simply write the full config in, and for P6 by explicitly coding it. 
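(Illustrative sketch only, previewing the hunks below rather than adding to them: the enable path now programs the complete event configuration itself and sets the EN bit only while the PMU is globally enabled, instead of depending on what a prior disable call left in the MSR.)

	/* Hedged sketch of the resulting P6 enable path, matching the patch below: */
	static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
	{
		struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
		u64 val = hwc->config;			/* full event configuration */

		if (cpuc->enabled)			/* EN only if globally enabled */
			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;

		(void)checking_wrmsrl(hwc->config_base + idx, val);
	}
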
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c7cc6eac14e..bed1c4c2f25 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -874,13 +874,13 @@ static void amd_pmu_enable_all(void) barrier(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_counter *counter = cpuc->counters[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - continue; + + val = counter->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -1044,11 +1044,13 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + u64 val; + val = hwc->config; if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); } @@ -1068,8 +1070,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); - else - x86_pmu_disable_counter(hwc, idx); } static int -- cgit v1.2.3 From f1c6a58121f9846ac665b0fbd3cbab90ce8bcbac Mon Sep 17 00:00:00 2001 From: Daniel Qarras Date: Sun, 12 Jul 2009 04:32:40 -0700 Subject: perf_counter, x86: Extend perf_counter Pentium M support I've attached a patch to remove the Pentium M special casing of EMON and as noticed at least with my Pentium M the hardware PMU now works: Performance counter stats for '/bin/ls /var/tmp': 1.809988 task-clock-msecs # 0.125 CPUs 1 context-switches # 0.001 M/sec 0 CPU-migrations # 0.000 M/sec 224 page-faults # 0.124 M/sec 1425648 cycles # 787.656 M/sec 912755 instructions # 0.640 IPC Vince suggested that this code was trying to address erratum Y17 in Pentium-M's: http://download.intel.com/support/processors/mobile/pm/sb/25266532.pdf But that erratum (related to IA32_MISC_ENABLES.7) does not affect perfcounters as we dont use this toggle to disable RDPMC and WRMSR/RDMSR access to performance counters. We keep cr4's bit 8 (X86_CR4_PCE) clear so unprivileged RDPMC access is not allowed anyway. 
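(Background illustration, not part of the patch: RDPMC reads counter 'idx' into EDX:EAX, but from user space it only works while CR4.PCE is set; with that bit kept clear, as described above, a hypothetical user-space reader like the sketch below simply faults with #GP, so the IA32_MISC_ENABLE.7 toggle the erratum talks about never comes into play for perfcounters.)

	/* Hedged sketch: raw RDPMC read of performance counter 'idx'. */
	static inline unsigned long long rdpmc_read(unsigned int idx)
	{
		unsigned int lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
		return ((unsigned long long)hi << 32) | lo;
	}
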
Cc: Vince Weaver Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bed1c4c2f25..7e346d4bc0f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1583,10 +1583,8 @@ static int p6_pmu_init(void) break; case 9: case 13: - /* for Pentium M, we need to check if PMU exist */ - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (low & MSR_IA32_MISC_ENABLE_EMON) - break; + /* Pentium M */ + break; default: pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); -- cgit v1.2.3 From e9084ec98bb9aa3abc6cf73181177780ce7546f8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 16 Jul 2009 09:45:11 +0100 Subject: x86, mce: Fix set_trigger() accessor Fix the condition checking the result of strchr() (which previously could result in an oops), and make the function return the number of bytes actively used. [ Impact: fix oops ] Signed-off-by: Jan Beulich Cc: Andi Kleen LKML-Reference: <4A5F04B7020000780000AB59@vpn.id2.novell.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 484c1e5f658..1cfb623ce11 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1692,17 +1692,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, const char *buf, size_t siz) { char *p; - int len; strncpy(mce_helper, buf, sizeof(mce_helper)); mce_helper[sizeof(mce_helper)-1] = 0; - len = strlen(mce_helper); p = strchr(mce_helper, '\n'); - if (*p) + if (p) *p = 0; - return len; + return strlen(mce_helper) + !!p; } static ssize_t set_ignore_ce(struct sys_device *s, -- cgit v1.2.3 From 9b7019ae6a542a801a80df6694c66e240e2c3e39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 16:31:36 +0200 Subject: perf_counter: Remove unused variables Fix a gcc unused variables warning. Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7e346d4bc0f..a7aa8f90095 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1570,8 +1570,6 @@ static struct x86_pmu amd_pmu = { static int p6_pmu_init(void) { - int high, low; - switch (boot_cpu_data.x86_model) { case 1: case 3: /* Pentium Pro */ -- cgit v1.2.3 From 2cb078603abb612e3bcd428fb8122c3d39e08832 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 22 Jul 2009 09:59:35 -0700 Subject: x86, amd: Don't probe for extended APIC ID if APICs are disabled If we've logically disabled apics, don't probe the PCI space for the AMD extended APIC ID. [ Impact: prevent boot crash under Xen. ] Signed-off-by: Jeremy Fitzhardinge Reported-by: Bastian Blank Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28e5f595604..e2485b03f1c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) /* check CPU config space for extended APIC ID */ - if (c->x86 >= 0xf) { + if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) -- cgit v1.2.3 From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 9 Aug 2009 21:44:49 -0700 Subject: x86, mce: therm_throt - change when we print messages My Latitude d630 seems to be handling thermal events in SMI by lowering the max frequency of the CPU till it cools down but still leaks the "everything is normal" events. This spams the console with high-priority printks. Adjust the therm_throt driver to only print the message that the temperature returned to normal when actually leaving the throttling state. Also lower the severity of the "back to normal" message from KERN_CRIT to KERN_INFO. Signed-off-by: Dmitry Torokhov Acked-by: H. Peter Anvin LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..8bc64cfbe93 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,24 +97,27 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } else if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } return 1; -- cgit v1.2.3 From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:47:36 +0200 Subject: perf_counter, x86: Fix lapic printk message Instead of this garbled bootup on UP Pentium-M systems: [ 0.015048] Performance Counters: [ 0.016004] no Local APIC, try rebooting with lapicno PMU
driver, software counters only. Print: [ 0.015050] Performance Counters: [ 0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it. [ 0.017003] no PMU driver, software counters only. Cf: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f90095..40e233a24d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1590,7 +1590,7 @@ static int p6_pmu_init(void) } if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); return -ENODEV; } -- cgit v1.2.3 From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:26:33 +0200 Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs Johannes Stezenbach reported that 'perf stat' does not count cache-miss and cache-references events on his Pentium-M based laptop. This is because we left them blank in p6_perfmon_event_map[], fill them in. Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40e233a24d9..fffc126dbdf 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -- cgit v1.2.3 From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Mon, 10 Aug 2009 19:56:45 -0300 Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag Due to an erratum with certain AMD Athlon 64 processors, the BIOS may need to force enable the LAHF_LM capability. Unfortunately, in at least one case, the BIOS does this even for processors that do not support the functionality. Add a specific check that will clear the feature bit for processors known not to support the LAHF/SAHF instructions. 
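(Side note with a hypothetical user-space check, not part of the patch: the flag being cleared here corresponds to CPUID leaf 0x80000001, ECX bit 0, which advertises LAHF/SAHF support in 64-bit mode; assuming GCC's <cpuid.h>, it can be queried like this.)

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* Leaf 0x80000001: extended feature bits; ECX bit 0 is LAHF_LM. */
		if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
			printf("lahf_lm: %s\n", (ecx & 1) ? "yes" : "no");
		return 0;
	}
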
Signed-off-by: Kevin Winchester Acked-by: Borislav Petkov LKML-Reference: <4A80A5AD.2000209@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b03f1c..63fddcd082c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); -- cgit v1.2.3 From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 11 Aug 2009 20:00:11 +0200 Subject: x86: Fix oops in identify_cpu() on CPUs without CPUID Kernel is broken for x86 CPUs without CPUID since 2.6.28. It crashes with NULL pointer dereference in identify_cpu(): 766 generic_identify(c); 767 768--> if (this_cpu->c_identify) 769 this_cpu->c_identify(c); this_cpu is NULL. This is because it's only initialized in get_cpu_vendor() function, which is not called if the CPU has no CPUID instruction. Signed-off-by: Ondrej Zary LKML-Reference: <200908112000.15993.linux@rainbow-software.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..5ce60a88027 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. 
It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; -- cgit v1.2.3 From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:40:08 +0200 Subject: perf_counter, x86: Fix/improve apic fallback Johannes Stezenbach reported that his Pentium-M based laptop does not have the local APIC enabled by default, and hence perfcounters do not get initialized. Add a fallback for this case: allow non-sampled counters and return with an error on sampled counters. This allows 'perf stat' to work out of box - and allows 'perf top' and 'perf record' to fall back on a hrtimer based sampling method. ( Passing 'lapic' on the boot line will allow hardware sampling to occur - but if the APIC is disabled permanently by the hardware then this fallback still allows more systems to use perfcounters. ) Also decouple perfcounter support from X86_LOCAL_APIC. -v2: fix typo breaking counters on all other systems ... Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fffc126dbdf..900332b800f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -644,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -657,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void 
perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = { .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, .num_counters = 2, @@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1614,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - return -ENODEV; + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } -- cgit v1.2.3 From 4e5c25d405e18a2f279ca2bfc855508ec3a0186b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 16 Aug 2009 15:54:37 +0100 Subject: x86, mce: therm_throt: Don't log redundant normality 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 "x86, mce: therm_throt - change when we print messages" removed redundant announcements of "Temperature/speed normal". They're not worth logging and remove their accompanying "Machine check events logged" messages as well from the console. Signed-off-by: Hugh Dickins Cc: Hidetoshi Seto Cc: Andi Kleen Cc: Dmitry Torokhov LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 8bc64cfbe93..5957a93e517 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -116,11 +116,14 @@ static int therm_throt_process(int curr) cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else if (was_throttled) { + return 1; + } + if (was_throttled) { printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + return 1; } - return 1; + return 0; } #ifdef CONFIG_SYSFS -- cgit v1.2.3 From c7f6fa44115d401e89db730f357629d39f8e4ba6 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! 
Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump content of STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default (the "mce=bootlog" boot parameter can be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1cfb623ce11..a0c2910d96a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1273,6 +1273,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; if (mce_bootlog != 0) mce_panic_timeout = 30; -- cgit v1.2.3 From e412cd257e0d51e0ecbb89f50953835b5a0681b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 17 Aug 2009 10:19:00 +0200 Subject: x86, mce: Don't initialize MCEs on unknown CPUs An older test-box started hanging at the following point during bootup: [ 0.022996] Mount-cache hash table entries: 512 [ 0.024996] Initializing cgroup subsys debug [ 0.025996] Initializing cgroup subsys cpuacct [ 0.026995] Initializing cgroup subsys devices [ 0.027995] Initializing cgroup subsys freezer [ 0.028995] mce: CPU supports 5 MCE banks I've bisected it down to commit 4efc0670 ("x86, mce: use 64bit machine check code on 32bit"), which utilizes the MCE code on 32-bit systems too. The problem is caused by this detail in my config: # CONFIG_CPU_SUP_INTEL is not set This disables the quirks in mce_cpu_quirks() but still enables MCE support - which then hangs due to the missing quirk workaround needed on this CPU: if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) mce_banks[0].init = 0; The safe solution is to not initialize MCEs if we don't know what CPU we are running on (or if that CPU's support code got disabled in the config). Also be a bit more defensive on 32-bit systems: don't do a boot-time dump of pending MCEs not just on the specific system where we found a problem (Pentium-M), but on earlier ones as well. Now this problem is probably not common and disabling CPU support is rare - but still being more defensive in something we turned on for a wide range of CPUs is prudent.
Cc: Hidetoshi Seto LKML-Reference: Message-ID: <4A88E3E4.40506@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a0c2910d96a..01213048f62 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1226,8 +1226,13 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static int mce_cpu_quirks(struct cpuinfo_x86 *c) { + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + return -EOPNOTSUPP; + } + /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { if (c->x86 == 15 && banks > 4) { @@ -1274,14 +1279,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; - /* There are also broken BIOSes on some Pentium M systems. */ - if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; if (mce_bootlog != 0) mce_panic_timeout = 30; + + return 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1342,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (mce_cap_init() < 0) { + if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { mce_disabled = 1; return; } - mce_cpu_quirks(c); machine_check_vector = do_machine_check; -- cgit v1.2.3 From 5416c2663517ebd0be0664c4d4ce3df0b116c059 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Aug 2009 12:25:41 -0700 Subject: x86: make sure load_percpu_segment has no stackprotector load_percpu_segment() is used to set up the per-cpu segment registers, which are also used for -fstack-protector. Make sure that the load_percpu_segment() function doesn't have stackprotector enabled. [ Impact: allow percpu setup before calling stack-protected functions ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e..8b5b9b625ed 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o -- cgit v1.2.3