From 3aa4b37d3e899cfe7a9cbdcda2b277df4c1f210d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:43 +0100 Subject: x86: make traps on entry code be debuggable in user space, 64-bit Unify the x86-64 behavior for 32-bit processes that set bogus %cs/%ss values (the only ones that can fault in iret) match what the native i386 behavior is. (do not kill the task via do_exit but generate a SIGSEGV signal) [ tglx@linutronix.de: build fix ] Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bea8474744ff..e518928114db 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -593,13 +593,22 @@ ENTRY(native_iret) .quad native_iret, bad_iret .previous .section .fixup,"ax" - /* force a signal here? this matches i386 behaviour */ - /* running with kernel gs */ bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - jmp do_exit + /* + * The iret traps when the %cs or %ss being restored is bogus. + * We've lost the original trap vector and error code. + * #GPF is the most likely one to get for an invalid selector. + * So pretend we completed the iret and took the #GPF in user mode. + * + * We are now running with the kernel GS after exception recovery. + * But error_entry expects us to have user GS to match the user %cs, + * so swap back. + */ + pushq $0 + + SWAPGS + jmp general_protection + .previous /* edi: workmask, edx: work */ -- cgit v1.2.3 From d8b57bb700a73872fd06b891d7c9bc4cea1a6af4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Feb 2008 22:39:43 +0100 Subject: x86: make spurious fault handler aware of large mappings In very rare cases, on certain CPUs, we could end up in the spurious fault handler and ignore a large pud/pmd mapping. The resulting pte pointer points into the mapped physical space and dereferencing it will fault recursively. Make the code aware of large mappings and do the permission check on the pmd/pud entry, when a large pud/pmd mapping is detected. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ad8b9733d6b3..d8ed4006b3d2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -428,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + /* * Handle a spurious fault caused by a stale TLB entry. This allows * us to lazily refresh the TLB when increasing the permissions of a @@ -457,20 +467,21 @@ static int spurious_fault(unsigned long address, if (!pud_present(*pud)) return 0; + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) - return 0; - - return 1; + return spurious_fault_check(error_code, pte); } /* -- cgit v1.2.3 From 2d684cd6d9cf0c6a0e28978362671b6e2d8fb56c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: remove X2 workaround With the spurious handler fix, the X2 does not lock up anymore. Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 16ce841f08d6..c870424aa9ad 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -260,17 +260,6 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_t old_prot, new_prot; int level, do_split = 1; - /* - * An Athlon 64 X2 showed hard hangs if we tried to preserve - * largepages and changed the PSE entry from RW to RO. - * - * As AMD CPUs have a long series of erratas in this area, - * (and none of the known ones seem to explain this hang), - * disable this code until the hang can be debugged: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return 1; - spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page -- cgit v1.2.3 From 4cc6028d4040f95cdb590a87db478b42b8be0508 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: brk: check the lower bound properly There is a check in sys_brk(), that tries to make sure that we do not underflow the area that is dedicated to brk heap. The check is however wrong, as it assumes that brk area starts immediately after the end of the code (+bss), which is wrong for example in environments with randomized brk start. The proper way is to check whether the address is not below the start_brk address. Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index bb4c963cc534..ad6e4eaf34f8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) down_write(&mm->mmap_sem); - if (brk < mm->end_code) + if (brk < mm->start_brk) goto out; /* -- cgit v1.2.3 From 32a932332c8bad842804842eaf9651ad6268e637 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: brk randomization: introduce CONFIG_COMPAT_BRK based on similar patch from: Pavel Machek Introduce CONFIG_COMPAT_BRK. If disabled then the kernel is free (but not obliged to) randomize the brk area. Heap randomization breaks ancient binaries, so we keep COMPAT_BRK enabled by default. Signed-off-by: Ingo Molnar --- fs/binfmt_elf.c | 2 +- init/Kconfig | 12 ++++++++++++ mm/memory.c | 13 ++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4628c42ca892..111771d38e6e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1077,7 +1077,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->start_stack = bprm->p; #ifdef arch_randomize_brk - if (current->flags & PF_RANDOMIZE) + if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); #endif diff --git a/init/Kconfig b/init/Kconfig index 87f50df58893..92b23e256614 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -541,6 +541,18 @@ config ELF_CORE help Enable support for generating core dumps. Disabling saves about 4k. +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overriden runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) Y is usually a safe choice. + config BASE_FULL default y bool "Enable full-sized data structures for core" if EMBEDDED diff --git a/mm/memory.c b/mm/memory.c index 7bb70728bb52..9d073fa0a2d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,7 +82,18 @@ void * high_memory; EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); -int randomize_va_space __read_mostly = 1; +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif static int __init disable_randmaps(char *s) { -- cgit v1.2.3 From c1f766b5519f9b5a51b0e6884ed9e02bce775ea8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: MAINTAINERS: RDC R-321x SoC maintainer Signed-off-by: Florian Fainelli Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4f3da8b56979..cffe13d71baa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3219,6 +3219,12 @@ M: mporter@kernel.crashing.org L: linux-kernel@vger.kernel.org S: Maintained +RDC R-321X SoC +P: Florian Fainelli +M: florian.fainelli@telecomint.eu +L: linux-kernel@vger.kernel.org +S: Maintained + RDC R6040 FAST ETHERNET DRIVER P: Florian Fainelli M: florian.fainelli@telecomint.eu -- cgit v1.2.3 From c63855d04034c96db791a7217954c93aa66d24cb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86 ptrace: disallow null cs/ss In my revamp of the x86 ptrace code for setting register values, I accidentally omitted a check that was there in the old code. Allowing %cs to be 0 causes a bad crash in recovery from iret failure. This patch fixes that regression against 2.6.24, and adds a comment that should help prevent this subtlety from being overlooked again. Signed-off-by: Roland McGrath Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 96286df1bb81..702c33efea84 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -103,9 +103,26 @@ static int set_segment_reg(struct task_struct *task, if (invalid_selector(value)) return -EIO; - if (offset != offsetof(struct user_regs_struct, gs)) + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + + default: *pt_regs_access(task_pt_regs(task), offset) = value; - else { + break; + + case offsetof(struct user_regs_struct, gs): task->thread.gs = value; if (task == current) /* @@ -227,12 +244,16 @@ static int set_segment_reg(struct task_struct *task, * Can't actually change these in 64-bit mode. */ case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->cs = value; #endif break; case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->ss = value; -- cgit v1.2.3 From 4a5a77d106d6b43183662d4ad37a613bbaa9b829 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: trivial sparse/checkpatch in quirks.c arch/x86/kernel/quirks.c:384:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:387:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:390:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:393:3: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 3cd7a2dcd4fe..6ba33ca8715a 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -380,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + default: break; } } -- cgit v1.2.3 From deef79ef351225a9fe02e41a40cb125ed03a3e6b Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: fix sparse error in traps_32.c This was being used to ensure the proper alignment of the FXSAVE/FXRSTOR data. This would create a sparse error in the _correct_ cases, hiding further warnings. Use BUILD_BUG_ON instead. Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 3cf72977d012..b22c01e05a18 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1176,17 +1176,12 @@ void __init trap_init(void) #endif set_trap_gate(19,&simd_coprocessor_error); + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generate a build-time error if the alignment is wrong. + */ + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generates a compile-time "error: zero width for bit-field" if - * the alignment is wrong. - */ - struct fxsrAlignAssert { - int _:!(offsetof(struct task_struct, - thread.i387.fxsave) & 15); - }; - printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); -- cgit v1.2.3 From d7ac12fa05ed839d5a426795409fdf1a480e3f7a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: fix sparse warnings in powernow-k8.c arch/x86/kernel/cpu/cpufreq/powernow-k8.c:830:7: warning: symbol 'hi' shadows an earlier one arch/x86/kernel/cpu/cpufreq/powernow-k8.c:824:6: originally declared here arch/x86/kernel/cpu/cpufreq/powernow-k8.c:830:15: warning: symbol 'lo' shadows an earlier one arch/x86/kernel/cpu/cpufreq/powernow-k8.c:824:14: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a0522735dd9d..5affe91ca1e5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -827,7 +827,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - u32 hi = 0, lo = 0; index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { -- cgit v1.2.3 From b5556a67f08559b6c1597f6396c1f9ef460f62b4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: cpuidle: dubious one-bit signed bitfield in cpuidle.h fix these sparse warnings: CHECK arch/x86/kernel/acpi/cstate.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield CHECK arch/x86/kernel/acpi/processor.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield CHECK arch/x86/kernel/cpu/cpufreq/powernow-k7.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield CHECK arch/x86/kernel/cpu/cpufreq/powernow-k8.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield CHECK arch/x86/kernel/cpu/cpufreq/longhaul.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield CHECK arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c include/linux/cpuidle.h:82:17: error: dubious one-bit signed bitfield Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/cpuidle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index c4e00161a247..b0fd85ab9efb 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -79,7 +79,7 @@ struct cpuidle_state_kobj { }; struct cpuidle_device { - int enabled:1; + unsigned int enabled:1; unsigned int cpu; int last_residency; -- cgit v1.2.3 From a57dae3aa4d00a000b5bac4238025438204c78b2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix iret exception recovery on 64-bit This change broke recovery of exceptions in iret: commit 72fe4858544292ad64600765cb78bc02298c6b1c Author: Glauber de Oliveira Costa x86: replace privileged instructions with paravirt macros The ENTRY(native_iret) macro adds alignment padding before the iretq instruction, so "iret_label" no longer points exactly at the instruction. It was sloppy to leave the old "iret_label" label behind when replacing its nearby use. Removing it would have revealed the other use of the label later in the file, and upon noticing that use, anyone exercising the minimum of attention to detail expected of anyone touching this subtle code would realize it needed to change as well. Signed-off-by: Roland McGrath Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e518928114db..c7341e81941c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -582,7 +582,6 @@ retint_restore_args: /* return to kernel space */ TRACE_IRQS_IRETQ restore_args: RESTORE_ARGS 0,8,0 -iret_label: #ifdef CONFIG_PARAVIRT INTERRUPT_RETURN #endif @@ -920,7 +919,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq native_iret(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ -- cgit v1.2.3 From 984bb80d94d891592ab16d4d129b988792752c7b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: mark the .rodata section also NX The .rodata section shouldn't just be read-only, but also non-executable. This is free since we've broken up the 2MB page already anyway. also update test_nx to check for this. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/test_nx.c | 2 +- arch/x86/mm/init_64.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 36c100c323aa..10b8a6f69f84 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -139,7 +139,6 @@ static int test_NX(void) * Until then, don't run them to avoid too many people getting scared * by the error message */ -#if 0 #ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ @@ -152,6 +151,7 @@ static int test_NX(void) } #endif +#if 0 /* Test 4: Check if the .data section of a module is executable */ if (test_address(&test_data)) { printk(KERN_ERR "test_nx: .data section is executable\n"); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3a98d6f724ab..9b61c75a2355 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -591,10 +591,17 @@ void mark_rodata_ro(void) if (end <= start) return; - set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + set_memory_nx(start, (end - start) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.3 From cc842b82cc513ebc78bef6eeaacb5f6335851bcb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: remove suprious ifdefs from pageattr.c The .rodata section really should just be read only; the config option is there to make breaking up the 2Mb page an option (so people whos machines give more performance for the 2Mb case can opt to do so). But when the page gets split anyway, this is no longer an issue, so clean up the code and remove the ifdefs Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c870424aa9ad..8493c855582b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -167,8 +167,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) pgprot_val(forbidden) |= _PAGE_NX; - -#ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) @@ -179,7 +177,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(__start_rodata), virt_to_highmap(__end_rodata))) pgprot_val(forbidden) |= _PAGE_RW; -#endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); -- cgit v1.2.3 From 9f9975a55dbcd82ff4a222691a6dcd7b3145b9c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: generic: add __FINITDATA Signed-off-by: Ingo Molnar --- include/linux/init.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/init.h b/include/linux/init.h index 90cdbbbbe077..a404a0055dd7 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -110,6 +110,7 @@ #define __FINIT .previous #define __INITDATA .section ".init.data","aw" +#define __FINITDATA .previous #define __DEVINIT .section ".devinit.text", "ax" #define __DEVINITDATA .section ".devinit.data", "aw" -- cgit v1.2.3 From f1fbabb312d657262322f4ce68b30a95f501945c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix 64-bit sections fix 64-bit section warnings. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4f283ad215ec..09b38d539b09 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -250,18 +250,13 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ -#ifndef CONFIG_HOTPLUG_CPU - .pushsection .init.data -#endif + __CPUINITDATA .align 8 - .globl initial_code -initial_code: + ENTRY(initial_code) .quad x86_64_start_kernel -#ifndef CONFIG_HOTPLUG_CPU - .popsection -#endif - .globl init_rsp -init_rsp: + __FINITDATA + + ENTRY(init_rsp) .quad init_thread_union+THREAD_SIZE-8 bad_address: -- cgit v1.2.3 From 971a52d66a3e87d4d2f5d3455e62680447cdb8e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: delay CPA self-test and repeat it delay the CPA self-test so that any impact (corruption) of user-space pagetables can be triggered. Repeat the test every 30 seconds. this would have prevented the bug fixed by 8cb2a7c1e95e472b5, at its source. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 +-- arch/x86/mm/pageattr-test.c | 65 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e1e3af28c3a..fa555148823d 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -220,9 +220,9 @@ config DEBUG_BOOT_PARAMS This option will cause struct boot_params to be exported via debugfs. config CPA_DEBUG - bool "CPA self test code" + bool "CPA self-test code" depends on DEBUG_KERNEL help - Do change_page_attr self tests at boot. + Do change_page_attr() self-tests every 30 seconds. endmenu diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 398f3a578dde..ed8201600354 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -5,6 +5,7 @@ * and compares page tables forwards and afterwards. */ #include +#include #include #include #include @@ -14,8 +15,13 @@ #include #include +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + enum { - NTEST = 4000, + NTEST = 400, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -31,7 +37,7 @@ struct split_state { long min_exec, max_exec; }; -static __init int print_split(struct split_state *s) +static int print_split(struct split_state *s) { long i, expected, missed = 0; int printed = 0; @@ -82,10 +88,13 @@ static __init int print_split(struct split_state *s) s->max_exec = addr; } } - printk(KERN_INFO - "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", - s->spg, s->lpg, s->gpg, s->exec, - s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; if (expected != i) { @@ -96,11 +105,11 @@ static __init int print_split(struct split_state *s) return err; } -static unsigned long __initdata addr[NTEST]; -static unsigned int __initdata len[NTEST]; +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; /* Change the global bit on random pages in the direct mapping */ -static __init int exercise_pageattr(void) +static int pageattr_test(void) { struct split_state sa, sb, sc; unsigned long *bm; @@ -110,7 +119,8 @@ static __init int exercise_pageattr(void) int i, k; int err; - printk(KERN_INFO "CPA exercising pageattr\n"); + if (print) + printk(KERN_INFO "CPA self-test:\n"); bm = vmalloc((max_pfn_mapped + 7) / 8); if (!bm) { @@ -186,7 +196,6 @@ static __init int exercise_pageattr(void) failed += print_split(&sb); - printk(KERN_INFO "CPA reverting everything\n"); for (i = 0; i < NTEST; i++) { if (!addr[i]) continue; @@ -214,12 +223,40 @@ static __init int exercise_pageattr(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + printk(KERN_ERR "NOT PASSED. Please report.\n"); WARN_ON(1); + return -EINVAL; } else { - printk(KERN_INFO "CPA selftests PASSED\n"); + if (print) + printk(KERN_INFO "ok.\n"); } return 0; } -module_init(exercise_pageattr); + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} + +module_init(start_pageattr_test); -- cgit v1.2.3 From 20651af9ac60fd6e31360688ad44861a7d05256a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix mttr trimming Pavel Emelyanov reported that his networking card did not work and bisected it down to: " The commit 093af8d7f0ba3c6be1485973508584ef081e9f93 x86_32: trim memory by updating e820 broke my e1000 card: on loading driver says that e1000: probe of 0000:04:03.0 failed with error -5 and the interface doesn't appear. " on a 32-bit kernel, base will overflow when try to do PAGE_SHIFT, and highest_addr will always less 4G. So use pfn instead of address to avoid the overflow when more than 4g RAM is installed on a 32-bit kernel. Many thanks to Pavel Emelyanov for reporting and testing it. Bisected-by: Pavel Emelyanov Signed-off-by: Yinghai Lu Tested-by: Pavel Emelyanov Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1e27b69a7a0e..b6e136f23d3d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -659,7 +659,7 @@ static __init int amd_special_default_mtrr(void) */ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { - unsigned long i, base, size, highest_addr = 0, def, dummy; + unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 trim_start, trim_size; @@ -682,28 +682,27 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - base <<= PAGE_SHIFT; - size <<= PAGE_SHIFT; - if (highest_addr < base + size) - highest_addr = base + size; + if (highest_pfn < base + size) + highest_pfn = base + size; } /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_addr) { + if (!highest_pfn) { printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); WARN_ON(1); return 0; } - if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + if (highest_pfn < end_pfn) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %LdMB of RAM.\n", - (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + " all of memory, losing %luMB of RAM.\n", + (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_addr; + trim_start = highest_pfn; + trim_start <<= PAGE_SHIFT; trim_size = end_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; -- cgit v1.2.3 From a09771bef9a375091f8ae706d992e20970e5d1e7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: virtio: fix trivial build bug fix build bug: drivers/virtio/virtio_balloon.c: In function 'fill_balloon': drivers/virtio/virtio_balloon.c:98: error: implicit declaration of function 'msleep' Acked-by: Rusty Russell Signed-off-by: Ingo Molnar --- drivers/virtio/virtio_balloon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 622aece1acce..c8a4332d1132 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -23,6 +23,7 @@ #include #include #include +#include struct virtio_balloon { -- cgit v1.2.3 From 58d5d0d8dd52cbca988af24b5692a20b00285543 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix deadlock, make pgd_lock irq-safe lockdep just caught this one: ================================= [ INFO: inconsistent lock state ] 2.6.24 #38 --------------------------------- inconsistent {in-softirq-W} -> {softirq-on-W} usage. swapper/1 [HC0[0]:SC0[0]:HE1:SE1] takes: (pgd_lock){-+..}, at: [] mm_init+0x1da/0x250 {in-softirq-W} state was registered at: [] 0xffffffffffffffff irq event stamp: 394559 hardirqs last enabled at (394559): [] get_page_from_freelist+0x30a/0x4c0 hardirqs last disabled at (394558): [] get_page_from_freelist+0x125/0x4c0 softirqs last enabled at (393952): [] __do_softirq+0xce/0xe0 softirqs last disabled at (393945): [] call_softirq+0x1c/0x30 other info that might help us debug this: no locks held by swapper/1. stack backtrace: Pid: 1, comm: swapper Not tainted 2.6.24 #38 Call Trace: [] print_usage_bug+0x18b/0x190 [] mark_lock+0x53d/0x560 [] __lock_acquire+0x3ca/0xed0 [] lock_acquire+0xa8/0xe0 [] ? mm_init+0x1da/0x250 [] _spin_lock+0x30/0x70 [] mm_init+0x1da/0x250 [] mm_alloc+0x39/0x50 [] bprm_mm_init+0x2a/0x1a0 [] do_execve+0x7b/0x220 [] sys_execve+0x46/0x70 [] kernel_execve+0x64/0xd0 [] ? _stext+0x1e/0x20 [] init_post+0x9a/0xf0 [] ? trace_hardirqs_on_thunk+0x35/0x3a [] ? trace_hardirqs_on+0xba/0xd0 [] ? child_rip+0xa/0x12 [] ? restore_args+0x0/0x44 [] ? child_rip+0x0/0x12 turns out that pgd_lock has been used on 64-bit x86 in an irq-unsafe way for almost two years, since commit 8c914cb704a11460e. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 5 +++-- include/asm-x86/pgalloc_64.h | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d8ed4006b3d2..621afb6343dc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -958,11 +958,12 @@ void vmalloc_sync_all(void) for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -971,7 +972,7 @@ void vmalloc_sync_all(void) else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); set_bit(pgd_index(address), insync); } if (address == start) diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 315314ce4bfb..4f6220db22b1 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -42,19 +42,21 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_add(&page->lru, &pgd_list); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } static inline void pgd_list_del(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_del(&page->lru); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) -- cgit v1.2.3