From ab4f2ee130d5ffcf35616e1f5c6ab75af5b463b6 Mon Sep 17 00:00:00 2001
From: Russell King
Date: Thu, 6 Nov 2008 17:11:07 +0000
Subject: [ARM] fix naming of MODULE_START / MODULE_END

As of 73bdf0a60e607f4b8ecc5aec597105976565a84f, the kernel needs to know
where modules are located in the virtual address space.  On ARM, we located
this region between MODULE_START and MODULE_END.  Unfortunately, everyone
else calls it MODULES_VADDR and MODULES_END.  Update ARM to use the same
naming, so is_vmalloc_or_module_addr() can work properly.

Also update the comment in mm/vmalloc.c to reflect that ARM also places
modules in a separate region from the vmalloc space.

Signed-off-by: Russell King
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f1cc03bbf6a..66fad3fc02b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -178,7 +178,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end,
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
-	 * x86-64 and sparc64 put modules in a special place,
+	 * ARM, x86-64 and sparc64 put modules in a special place,
 	 * and fall back on vmalloc() if that fails.  Others
 	 * just put it in the vmalloc space.
 	 */
--
cgit v1.2.3


From 69d177c2fc702d402b17fdca2190d5a7e3ca55c5 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Thu, 6 Nov 2008 12:53:26 -0800
Subject: hugetlbfs: handle pages higher order than MAX_ORDER

When working with hugepages, hugetlbfs assumes that those hugepages are
smaller than MAX_ORDER.  Specifically it assumes that the mem_map is
contiguous and uses that to optimise access to the elements of the mem_map
that represent the hugepage.

Gigantic pages (such as 16GB pages on powerpc) by definition are of greater
order than MAX_ORDER (larger than MAX_ORDER_NR_PAGES in size).  This means
that we can no longer make use of the buddy allocator guarantees for the
contiguity of the mem_map, which ensures that the mem_map is at least
contiguous for maximally aligned areas of MAX_ORDER_NR_PAGES pages.

This patch adds new mem_map accessors and iterator helpers which handle any
discontiguity at MAX_ORDER_NR_PAGES boundaries.  It then uses these to
implement gigantic page versions of copy_huge_page and clear_huge_page, and
to allow follow_hugetlb_page to handle gigantic pages.
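[ A minimal usage sketch of the accessors this message describes; it is
  illustrative only and not part of the patch, and touch_each_subpage()
  is a hypothetical helper: ]

	/*
	 * Walk every subpage of a huge page without assuming the mem_map
	 * is contiguous beyond a maximally aligned MAX_ORDER_NR_PAGES
	 * block.  mem_map_offset() (added to mm/internal.h below) falls
	 * back to pfn_to_page() once 'offset' crosses that boundary.
	 */
	static void touch_each_subpage(struct page *head, int nr_pages)
	{
		int i;

		for (i = 0; i < nr_pages; i++) {
			struct page *p = mem_map_offset(head, i);

			/* operate on subpage p here, e.g. clear or copy it */
		}
	}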
Signed-off-by: Andy Whitcroft
Cc: Jon Tollefson
Cc: Mel Gorman
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: [2.6.27.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c  | 37 ++++++++++++++++++++++++++++++++++++-
 mm/internal.h | 28 ++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 421aee99b84..e6afe527bd0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -354,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
 	return 0;
 }
 
+static void clear_gigantic_page(struct page *page,
+			unsigned long addr, unsigned long sz)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
 static void clear_huge_page(struct page *page,
 			unsigned long addr, unsigned long sz)
 {
 	int i;
 
+	if (unlikely(sz > MAX_ORDER_NR_PAGES))
+		return clear_gigantic_page(page, addr, sz);
+
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
@@ -366,12 +381,32 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src,
+			unsigned long addr, struct vm_area_struct *vma)
+{
+	int i;
+	struct hstate *h = hstate_vma(vma);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
 static void copy_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+		return copy_gigantic_page(dst, src, addr, vma);
+
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
@@ -2130,7 +2165,7 @@ same_page:
 		if (zeropage_ok)
 			pages[i] = ZERO_PAGE(0);
 		else
-			pages[i] = page + pfn_offset;
+			pages[i] = mem_map_offset(page, pfn_offset);
 		get_page(pages[i]);
 	}
diff --git a/mm/internal.h b/mm/internal.h
index e4e728bdf32..f482460de3e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -175,6 +175,34 @@ static inline void free_page_mlock(struct page *page) { }
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'.  Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+		return pfn_to_page(page_to_pfn(base) + offset);
+	return base + offset;
+}
+
+/*
+ * Iterator over all subpages withing the maximally aligned gigantic
+ * page 'base'.  Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+						struct page *base, int offset)
+{
+	if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+		unsigned long pfn = page_to_pfn(base) + offset;
+		if (!pfn_valid(pfn))
+			return NULL;
+		return pfn_to_page(pfn);
+	}
+	return iter + 1;
+}
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
--
cgit v1.2.3


From 18229df5b613ed0732a766fc37850de2e7988e43 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Thu, 6 Nov 2008 12:53:27 -0800
Subject: hugetlb: pull gigantic page initialisation out of the default path

As we can determine exactly when a gigantic page is in use, we can optimise
the common regular page cases by pulling gigantic page initialisation out
into its own function.  As gigantic pages are never released to buddy we
do not need a destructor.  This effectively reverts the previous change to
the main buddy allocator.  It also adds a paranoid check to ensure we never
release gigantic pages from hugetlbfs to the main buddy.

Signed-off-by: Andy Whitcroft
Cc: Jon Tollefson
Cc: Mel Gorman
Cc: Nick Piggin
Cc: Christoph Lameter
Cc: [2.6.27.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c    | 12 +++++++++++-
 mm/internal.h   |  1 +
 mm/page_alloc.c | 28 +++++++++++++++++++++-------
 3 files changed, 33 insertions(+), 8 deletions(-)
(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e6afe527bd0..d143ab67be4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -491,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
 
+	VM_BUG_ON(h->order >= MAX_ORDER);
+
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -1005,6 +1007,14 @@ found:
 	return 1;
 }
 
+static void prep_compound_huge_page(struct page *page, int order)
+{
+	if (unlikely(order > (MAX_ORDER - 1)))
+		prep_compound_gigantic_page(page, order);
+	else
+		prep_compound_page(page, order);
+}
+
 /* Put bootmem huge pages into the standard lists after mem_map is up */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -1015,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void)
 		struct hstate *h = m->hstate;
 		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
-		prep_compound_page(page, h->order);
+		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
 	}
 }
diff --git a/mm/internal.h b/mm/internal.h
index f482460de3e..13333bc2eb6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
 
 static inline void set_page_count(struct page *page, int v)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d0a240fbb8b..54069e64e3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -260,6 +260,23 @@ static void free_compound_page(struct page *page)
 }
 
 void prep_compound_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+
+	set_compound_page_dtor(page, free_compound_page);
+	set_compound_order(page, order);
+	__SetPageHead(page);
+	for (i = 1; i < nr_pages; i++) {
+		struct page *p = page + i;
+
+		__SetPageTail(p);
+		p->first_page = page;
+	}
+}
+
+#ifdef CONFIG_HUGETLBFS
+void prep_compound_gigantic_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
@@ -268,19 +285,17 @@ void prep_compound_page(struct page *page, unsigned long order)
 	set_compound_page_dtor(page, free_compound_page);
 	set_compound_order(page, order);
 	__SetPageHead(page);
-	for (i = 1; i < nr_pages; i++, p++) {
-		if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
-			p = pfn_to_page(page_to_pfn(page) + i);
+	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 		__SetPageTail(p);
 		p->first_page = page;
 	}
 }
+#endif
 
 static void destroy_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
-	struct page *p = page + 1;
 
 	if (unlikely(compound_order(page) != order))
 		bad_page(page);
@@ -288,9 +303,8 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	if (unlikely(!PageHead(page)))
 		bad_page(page);
 	__ClearPageHead(page);
-	for (i = 1; i < nr_pages; i++, p++) {
-		if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
-			p = pfn_to_page(page_to_pfn(page) + i);
+	for (i = 1; i < nr_pages; i++) {
+		struct page *p = page + i;
 
 		if (unlikely(!PageTail(p) |
 				(p->first_page != page)))
--
cgit v1.2.3


From b4416d2bea007f07f2e74cdc4cb64042ec996c83 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 6 Nov 2008 12:53:29 -0800
Subject: oom: do not dump task state for non thread group leaders

When /proc/sys/vm/oom_dump_tasks is enabled, it's only necessary to dump
task state information for thread group leaders.  The kernel log gets
quickly overwhelmed on machines with a massive number of threads by
dumping non-thread group leaders.

Reviewed-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 64e5b4bcd96..2846a58e5de 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -295,6 +295,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 			continue;
 		if (mem && !task_in_mem_cgroup(p, mem))
 			continue;
+		if (!thread_group_leader(p))
+			continue;
 
 		task_lock(p);
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
--
cgit v1.2.3


From 0aedadf91a70a11c4a3e7c7d99b21e5528af8d5d Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 6 Nov 2008 12:53:30 -0800
Subject: mm: move migrate_prep out from under mmap_sem

Move the migrate_prep outside the mmap_sem for the following system calls:

1. sys_move_pages
2. sys_migrate_pages
3. sys_mbind()

It really does not matter when we flush the lru.  The system is free to add
pages onto the lru even during migration, which will make the page migration
either skip the page (mbind, migrate_pages) or return a busy state
(move_pages).

Fixes this lockdep warning (and potential deadlock):

Some VM place has
      mmap_sem -> kevent_wq via lru_add_drain_all()

net/core/dev.c::dev_ioctl() has
      rtnl_lock -> mmap_sem
      (*) the ioctl has copy_from_user() and it can do page fault.
linkwatch_event has
      kevent_wq -> rtnl_lock

Signed-off-by: Christoph Lameter
Cc: KOSAKI Motohiro
Reported-by: Heiko Carstens
Cc: Nick Piggin
Cc: Hugh Dickins
Cc: Rik van Riel
Cc: Lee Schermerhorn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 18 +++++++++++-------
 mm/migrate.c   |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)
(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36f42573a33..e9493b1c111 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,12 +489,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-
-		err = migrate_prep();
-		if (err)
-			return ERR_PTR(err);
-	}
 
 	first = find_vma(mm, start);
 	if (!first)
@@ -809,9 +803,13 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes,
 	const nodemask_t *to_nodes, int flags)
 {
 	int busy = 0;
-	int err = 0;
+	int err;
 	nodemask_t tmp;
 
+	err = migrate_prep();
+	if (err)
+		return err;
+
 	down_read(&mm->mmap_sem);
 
 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
@@ -974,6 +972,12 @@ static long do_mbind(unsigned long start, unsigned long len,
 		 start, start + len, mode, mode_flags,
 		 nmask ? nodes_addr(*nmask)[0] : -1);
 
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+		err = migrate_prep();
+		if (err)
+			return err;
+	}
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6602941bfab..385db89f0c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -841,12 +841,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 	struct page_to_node *pp;
 	LIST_HEAD(pagelist);
 
+	migrate_prep();
 	down_read(&mm->mmap_sem);
 
 	/*
 	 * Build a list of pages to migrate
 	 */
-	migrate_prep();
 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
 		struct vm_area_struct *vma;
 		struct page *page;
--
cgit v1.2.3


From b41ad14c30acf023d09ac064096a4cf41248ce46 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 6 Nov 2008 12:53:31 -0800
Subject: vmemmap: warn about page_structs with remote distance

It's insufficient to simply compare node ids when warning about offnode
page_structs since it's possible to still have local affinity.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse-vmemmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'mm')

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a91b5f8fcaf..a13ea6401ae 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -64,7 +64,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 	unsigned long pfn = pte_pfn(*pte);
 	int actual_node = early_pfn_to_nid(pfn);
 
-	if (actual_node != node)
+	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
 		printk(KERN_WARNING "[%lx-%lx] potential offnode "
 			"page_structs\n", start, end - 1);
 }
--
cgit v1.2.3


From fbdd12676c83df77480f00ebd32fc98fbe3bf836 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng
Date: Thu, 6 Nov 2008 12:53:34 -0800
Subject: mm/oom_kill.c: fix badness() kerneldoc

The @mem parameter has been removed since v2.6.26; delete its comment.
Signed-off-by: Qinghuang Feng
Acked-by: Randy Dunlap
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 1 -
 1 file changed, 1 deletion(-)
(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2846a58e5de..a0a01902f55 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,7 +38,6 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
- * @mem: target memory controller
  *
  * The formula used is relatively simple and documented inline in the
  * function.  The main rationale is that we want to select a good task
--
cgit v1.2.3


From a70dcb969f64e2fa98c24f47854f20bf02ff0092 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Thu, 6 Nov 2008 12:53:36 -0800
Subject: memory hotplug: fix page_zone() calculation in test_pages_isolated()

My last bugfix here (adding zone->lock) introduced a new problem: using
page_zone(pfn_to_page(pfn)) to get the zone after the for() loop is wrong.
pfn will then be >= end_pfn, which may be in a different zone or not
present at all.  This may lead to an addressing exception in page_zone()
or spin_lock_irqsave().

Now I use __first_valid_page() again after the loop to find a valid page
for page_zone().

Signed-off-by: Gerald Schaefer
Acked-by: Nathan Fontenot
Reviewed-by: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_isolation.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'mm')

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b70a7fec1ff..5e0ffd96745 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -130,10 +130,11 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 		if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
 			break;
 	}
-	if (pfn < end_pfn)
+	page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+	if ((pfn < end_pfn) || !page)
 		return -EBUSY;
 	/* Check all pages are free or Marked as ISOLATED */
-	zone = page_zone(pfn_to_page(pfn));
+	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
 	ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
 	spin_unlock_irqrestore(&zone->lock, flags);
--
cgit v1.2.3


From 9b46333406b9cb3397ab538485a4d57c316af0ff Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 28 Oct 2008 19:22:34 +1100
Subject: vmap: cope with vm_unmap_aliases before vmalloc_init()

Xen can end up calling vm_unmap_aliases() before vmalloc_init() has been
called.  In this case it's safe to make it a simple no-op.
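[ Illustrative sketch of the ordering problem being handled; names follow
  the patch below, but this block is not part of it: ]

	/*
	 * Before vmalloc_init() runs, the per-cpu vmap block queues are
	 * not set up yet, so there is nothing to flush and the function
	 * can simply return.  The flag is flipped at the end of
	 * vmalloc_init(), as in the hunks that follow.
	 */
	static bool vmap_initialized __read_mostly = false;

	void vm_unmap_aliases(void)
	{
		if (unlikely(!vmap_initialized))
			return;		/* nothing mapped yet: safe no-op */

		/* ... walk the per-cpu vmap_block_queue lists and flush ... */
	}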
Signed-off-by: Jeremy Fitzhardinge
Cc: Linux Memory Management List
Cc: Nick Piggin
Signed-off-by: Ingo Molnar
---
 mm/vmalloc.c | 7 +++++++
 1 file changed, 7 insertions(+)
(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 66fad3fc02b..ba6b0f5f7fa 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -592,6 +592,8 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
 
 #define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
 
+static bool vmap_initialized __read_mostly = false;
+
 struct vmap_block_queue {
 	spinlock_t lock;
 	struct list_head free;
@@ -828,6 +830,9 @@ void vm_unmap_aliases(void)
 	int cpu;
 	int flush = 0;
 
+	if (unlikely(!vmap_initialized))
+		return;
+
 	for_each_possible_cpu(cpu) {
 		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
 		struct vmap_block *vb;
@@ -942,6 +947,8 @@ void __init vmalloc_init(void)
 		INIT_LIST_HEAD(&vbq->dirty);
 		vbq->nr_dirty = 0;
 	}
+
+	vmap_initialized = true;
 }
 
 void unmap_kernel_range(unsigned long addr, unsigned long size)
--
cgit v1.2.3


From 1c1271850494f06b63ae6b485e2e1b9c27ffb2d1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 12 Nov 2008 01:24:41 +0100
Subject: parisc: fix find_extend_vma() breakage

The STACK_GROWSUP case of stack expansion was missing a test for 'prev',
which got removed by commit cb8f488c33539f096580e202f5438a809195008f
("mmap.c: deinline a few functions") by mistake.

I found my original email in the "sent" folder.  The patch in that mail
does NOT remove !prev.  That change had been added by someone else.

Ok, I think we are not much interested in who did it; let's fix it for
good.

[ "It looks like this was caused by me fixing rejects.  That was the
   fancy include-lots-of-context-so-it-wont-apply patch." - akpm ]

Reported-and-bisected-by: Helge Deller
Signed-off-by: Denys Vlasenko
Cc: Andrew Morton
Cc: Jiri Kosina
Signed-off-by: Linus Torvalds
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index de14ac21e5b..d4855a682ab 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1704,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	vma = find_vma_prev(mm, addr, &prev);
 	if (vma && (vma->vm_start <= addr))
 		return vma;
-	if (expand_stack(prev, addr))
+	if (!prev || expand_stack(prev, addr))
 		return NULL;
 	if (prev->vm_flags & VM_LOCKED) {
 		if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
--
cgit v1.2.3


From 7526674de0c921e7f1e9b6f71a1f9d832557b554 Mon Sep 17 00:00:00 2001
From: Adam Litke
Date: Wed, 12 Nov 2008 13:24:56 -0800
Subject: hugetlb: make unmap_ref_private multi-size-aware

Oops.  Part of the hugetlb private reservation code was not fully
converted to use hstates.

When a huge page must be unmapped from VMAs due to a failed COW,
HPAGE_SIZE is used in the call to unmap_hugepage_range() regardless of
the page size being used.  This works if the VMA is using the default
huge page size.  Otherwise we might unmap too much, too little, or
trigger a BUG_ON.  Rare but serious -- fix it.
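[ Illustrative sketch of the difference, assuming a non-default (e.g. 16GB)
  hstate for the VMA; this block is not part of the patch: ]

	struct hstate *h = hstate_vma(vma);

	/* Wrong for non-default sizes: always unmaps HPAGE_SIZE bytes */
	unmap_hugepage_range(vma, address, address + HPAGE_SIZE, page);

	/* Correct: derive alignment and length from the VMA's hstate */
	address &= huge_page_mask(h);
	unmap_hugepage_range(vma, address, address + huge_page_size(h), page);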
Signed-off-by: Adam Litke
Cc: Jon Tollefson
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d143ab67be4..6058b53dcb8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1796,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 				struct page *page, unsigned long address)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
 	struct prio_tree_iter iter;
@@ -1805,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
-	address = address & huge_page_mask(hstate_vma(vma));
+	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
@@ -1824,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma,
-				address, address + HPAGE_SIZE,
+				address, address + huge_page_size(h),
 				page);
 	}
--
cgit v1.2.3


From e33c3b5e172e2e45456f42fba47227d48745543f Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Wed, 12 Nov 2008 13:25:37 -0800
Subject: cpusets: update mems allowed in page allocator

If all allowable memory is unreclaimable, it is possible to loop forever
in the page allocator for ~__GFP_NORETRY allocations.

During this time, it is also possible for a task's cpuset to expand its
set of allowable nodes so that it now includes free memory.  The cached
copy of this set, current->mems_allowed, is stale, however, since there
has not been a subsequent call to cpuset_update_task_memory_state().

The cached copy of the set of allowable nodes is now updated in the page
allocator's slow path so the additional memory is available to
get_page_from_freelist().

[akpm@linux-foundation.org: add comment]
Signed-off-by: David Rientjes
Cc: Paul Menage
Cc: Christoph Lameter
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 4 ++++
 1 file changed, 4 insertions(+)
(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54069e64e3a..d8ac0147456 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1561,6 +1561,10 @@ nofail_alloc:
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	cpuset_update_task_memory_state();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
--
cgit v1.2.3


From 8891d6da17db0f9bb507d3a017f130b9970c3087 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Wed, 12 Nov 2008 13:26:53 -0800
Subject: mm: remove lru_add_drain_all() from the munlock path

lockdep warns about the following message at boot time on one of my test
machines.  schedule_on_each_cpu() shouldn't be called while the task holds
mmap_sem.

Actually, lru_add_drain_all() exists to prevent unevictable pages from
staying on the reclaimable lru list.  But the current unevictable code can
rescue unevictable pages even if they stay on the reclaimable list, so
removing it is better.

In addition, this patch adds lru_add_drain_all() to sys_mlock() and
sys_mlockall().
This isn't strictly required, but it reduces the chance of pages failing to
move to the unevictable list; such a failure can still be rescued by vmscan
later, but reducing it is better.

Note: if the above rescuing happens, the Mlocked and Unevictable fields in
/proc/meminfo can mismatch, but it doesn't cause any real trouble.

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.28-rc2-mm1 #2
-------------------------------------------------------
lvm/1103 is trying to acquire lock:
 (&cpu_hotplug.lock){--..}, at: [] get_online_cpus+0x29/0x50

but task is already holding lock:
 (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&mm->mmap_sem){----}:
       [] check_noncircular+0x82/0x110
       [] might_fault+0x4a/0xa0
       [] validate_chain+0xb11/0x1070
       [] might_fault+0x4a/0xa0
       [] __lock_acquire+0x263/0xa10
       [] lock_acquire+0x7c/0xb0             (*) grab mmap_sem
       [] might_fault+0x4a/0xa0
       [] might_fault+0x7b/0xa0
       [] might_fault+0x4a/0xa0
       [] copy_to_user+0x30/0x60
       [] filldir+0x7c/0xd0
       [] sysfs_readdir+0x11a/0x1f0          (*) grab sysfs_mutex
       [] filldir+0x0/0xd0
       [] filldir+0x0/0xd0
       [] vfs_readdir+0x86/0xa0              (*) grab i_mutex
       [] sys_getdents+0x6b/0xc0
       [] syscall_call+0x7/0xb
       [] 0xffffffff

-> #2 (sysfs_mutex){--..}:
       [] check_noncircular+0x82/0x110
       [] sysfs_addrm_start+0x2c/0xc0
       [] validate_chain+0xb11/0x1070
       [] sysfs_addrm_start+0x2c/0xc0
       [] __lock_acquire+0x263/0xa10
       [] lock_acquire+0x7c/0xb0             (*) grab sysfs_mutex
       [] sysfs_addrm_start+0x2c/0xc0
       [] mutex_lock_nested+0xa5/0x2f0
       [] sysfs_addrm_start+0x2c/0xc0
       [] sysfs_addrm_start+0x2c/0xc0
       [] sysfs_addrm_start+0x2c/0xc0
       [] create_dir+0x3f/0x90
       [] sysfs_create_dir+0x29/0x50
       [] _spin_unlock+0x25/0x40
       [] kobject_add_internal+0xcd/0x1a0
       [] kobject_set_name_vargs+0x3a/0x50
       [] kobject_init_and_add+0x2d/0x40
       [] sysfs_slab_add+0xd2/0x180
       [] sysfs_add_func+0x0/0x70
       [] sysfs_add_func+0x5c/0x70           (*) grab slub_lock
       [] run_workqueue+0x172/0x200
       [] run_workqueue+0x10f/0x200
       [] worker_thread+0x0/0xf0
       [] worker_thread+0x9c/0xf0
       [] autoremove_wake_function+0x0/0x50
       [] worker_thread+0x0/0xf0
       [] kthread+0x42/0x70
       [] kthread+0x0/0x70
       [] kernel_thread_helper+0x7/0x1c
       [] 0xffffffff

-> #1 (slub_lock){----}:
       [] check_noncircular+0xd/0x110
       [] slab_cpuup_callback+0x11f/0x1d0
       [] validate_chain+0xb11/0x1070
       [] slab_cpuup_callback+0x11f/0x1d0
       [] mark_lock+0x35d/0xd00
       [] __lock_acquire+0x263/0xa10
       [] lock_acquire+0x7c/0xb0
       [] slab_cpuup_callback+0x11f/0x1d0
       [] down_read+0x43/0x80
       [] slab_cpuup_callback+0x11f/0x1d0    (*) grab slub_lock
       [] slab_cpuup_callback+0x11f/0x1d0
       [] notifier_call_chain+0x3c/0x70
       [] _cpu_up+0x84/0x110
       [] cpu_up+0x4b/0x70                   (*) grab cpu_hotplug.lock
       [] kernel_init+0x0/0x170
       [] kernel_init+0xb5/0x170
       [] kernel_init+0x0/0x170
       [] kernel_thread_helper+0x7/0x1c
       [] 0xffffffff

-> #0 (&cpu_hotplug.lock){--..}:
       [] validate_chain+0x5af/0x1070
       [] dev_status+0x0/0x50
       [] __lock_acquire+0x263/0xa10
       [] lock_acquire+0x7c/0xb0
       [] get_online_cpus+0x29/0x50
       [] mutex_lock_nested+0xa5/0x2f0
       [] get_online_cpus+0x29/0x50
       [] get_online_cpus+0x29/0x50
       [] lru_add_drain_per_cpu+0x0/0x10
       [] get_online_cpus+0x29/0x50          (*) grab cpu_hotplug.lock
       [] schedule_on_each_cpu+0x32/0xe0
       [] __mlock_vma_pages_range+0x85/0x2c0
       [] __lock_acquire+0x285/0xa10
       [] vma_merge+0xa9/0x1d0
       [] mlock_fixup+0x180/0x200
       [] do_mlockall+0x78/0x90              (*) grab mmap_sem
       [] sys_mlockall+0x81/0xb0
       [] syscall_call+0x7/0xb
       [] 0xffffffff

other info that might help us debug this:

1 lock held by lvm/1103:
 #0:  (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0

stack backtrace:
Pid: 1103, comm: lvm Not tainted 2.6.28-rc2-mm1 #2
Call Trace:
       [] print_circular_bug_tail+0x7c/0xd0
       [] validate_chain+0x5af/0x1070
       [] dev_status+0x0/0x50
       [] __lock_acquire+0x263/0xa10
       [] lock_acquire+0x7c/0xb0
       [] get_online_cpus+0x29/0x50
       [] mutex_lock_nested+0xa5/0x2f0
       [] get_online_cpus+0x29/0x50
       [] get_online_cpus+0x29/0x50
       [] lru_add_drain_per_cpu+0x0/0x10
       [] get_online_cpus+0x29/0x50
       [] schedule_on_each_cpu+0x32/0xe0
       [] __mlock_vma_pages_range+0x85/0x2c0
       [] __lock_acquire+0x285/0xa10
       [] vma_merge+0xa9/0x1d0
       [] mlock_fixup+0x180/0x200
       [] do_mlockall+0x78/0x90
       [] sys_mlockall+0x81/0xb0
       [] syscall_call+0x7/0xb

Signed-off-by: KOSAKI Motohiro
Tested-by: Kamalesh Babulal
Cc: Lee Schermerhorn
Cc: Christoph Lameter
Cc: Heiko Carstens
Cc: Nick Piggin
Cc: Hugh Dickins
Cc: Rik van Riel
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mlock.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)
(limited to 'mm')

diff --git a/mm/mlock.c b/mm/mlock.c
index 008ea70b7af..a6da2aee940 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page)
 		putback_lru_page(page);
 	} else {
 		/*
-		 * Page not on the LRU yet.  Flush all pagevecs and retry.
+		 * We lost the race. the page already moved to evictable list.
 		 */
-		lru_add_drain_all();
-		if (!isolate_lru_page(page))
-			putback_lru_page(page);
-		else if (PageUnevictable(page))
+		if (PageUnevictable(page))
 			count_vm_event(UNEVICTABLE_PGSTRANDED);
-	}
 }
 
@@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_WRITE)
 		gup_flags |= GUP_FLAGS_WRITE;
 
-	lru_add_drain_all();	/* push cached pages to LRU */
-
 	while (nr_pages > 0) {
 		int i;
 
@@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 		ret = 0;
 	}
 
-	lru_add_drain_all();	/* to update stats */
-
 	return ret;	/* count entire vma as locked_vm */
 }
 
@@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
 	if (!can_do_mlock())
 		return -EPERM;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
@@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags)
 	if (!can_do_mlock())
 		goto out;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
--
cgit v1.2.3


From 33c5d3d64589c5d379db5a5615735f6d08438369 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 12 Nov 2008 13:27:01 -0800
Subject: memcg: bugfix for memory hotplug

The start pfn calculation in page_cgroup's memory hotplug notifier chain
is wrong.
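[ Worked example of the rounding fix; the PAGES_PER_SECTION value of 0x4000
  is an assumption for illustration and not part of the patch: ]

	/*
	 * With start_pfn = 0x12345 and PAGES_PER_SECTION = 0x4000:
	 *
	 *   start_pfn &  (PAGES_PER_SECTION - 1) = 0x02345
	 *       (offset into the section -- wrong)
	 *   start_pfn & ~(PAGES_PER_SECTION - 1) = 0x10000
	 *       (section-aligned base pfn -- what the loop needs)
	 */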
Tested-by: Badari Pulavarty
Signed-off-by: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'mm')

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f59d797dc5a..1223d927904 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -165,7 +165,7 @@ int online_page_cgroup(unsigned long start_pfn,
 	unsigned long start, end, pfn;
 	int fail = 0;
 
-	start = start_pfn & (PAGES_PER_SECTION - 1);
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
@@ -188,7 +188,7 @@ int offline_page_cgroup(unsigned long start_pfn,
 {
 	unsigned long start, end, pfn;
 
-	start = start_pfn & (PAGES_PER_SECTION - 1);
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
--
cgit v1.2.3