aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorJens Axboe <jens.axboe@oracle.com>2010-02-22 13:48:51 +0100
committerJens Axboe <jens.axboe@oracle.com>2010-02-22 13:48:51 +0100
commitf11cbd74c5ff3614f6390b4de67a6ffdc614c378 (patch)
tree6a30920ade9eeaac5bf6d6263b5d09712e882eb0 /mm
parent429c42c9d246f5bda868495c09974312a0177328 (diff)
parentaea187c46f7d03ce985e55eb1398d0776a15b928 (diff)
Merge branch 'master' into for-2.6.34
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c103
-rw-r--r--mm/hugetlb.c9
-rw-r--r--mm/maccess.c11
-rw-r--r--mm/memcontrol.c11
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/nommu.c119
-rw-r--r--mm/page_alloc.c9
-rw-r--r--mm/percpu.c4
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c114
-rw-r--r--mm/vmscan.c3
12 files changed, 287 insertions, 131 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6c..698ea80f210 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
{
struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = page_cache_alloc_cold(mapping);
+ page = __page_cache_alloc(gfp | __GFP_COLD);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
return page;
}
-/**
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
+
{
struct page *page;
int err;
retry:
- page = __read_cache_page(mapping, index, filler, data);
+ page = __read_cache_page(mapping, index, filler, data, gfp);
if (IS_ERR(page))
return page;
if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
mark_page_accessed(page);
return page;
}
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
EXPORT_SYMBOL(read_cache_page_async);
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+ return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
/**
* read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page;
-
- page = read_cache_page_async(mapping, index, filler, data);
- if (IS_ERR(page))
- goto out;
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- out:
- return page;
+ return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);
@@ -2196,6 +2232,9 @@ again:
if (unlikely(status))
break;
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c21820..2d16fa6b8c2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, sz);
return;
}
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
- struct kobject *parent,
- struct kobject **hstate_kobjs,
- struct attribute_group *hstate_attr_group)
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ struct attribute_group *hstate_attr_group)
{
int retval;
int hi = h - hstates;
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25..4e348dbaecd 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long probe_kernel_read(void *dst, void *src, size_t size)
+
+long __weak probe_kernel_read(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
+long __probe_kernel_read(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, void *src, size_t size)
+ __attribute__((alias("__probe_kernel_write")));
+
+long __probe_kernel_write(void *dst, void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8..954032b80be 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
if (free_all)
goto try_to_free;
move_account:
- while (mem->res.usage > 0) {
+ do {
ret = -EBUSY;
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
@@ -2614,8 +2614,8 @@ move_account:
if (ret == -ENOMEM)
goto try_to_free;
cond_resched();
- }
- ret = 0;
+ /* "ret" should also be checked to ensure all lists are empty. */
+ } while (mem->res.usage > 0 || ret);
out:
css_put(&mem->css);
return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
}
lru_add_drain();
/* try move_account...there may be some *locked* pages. */
- if (mem->res.usage)
- goto move_account;
- ret = 0;
- goto out;
+ goto move_account;
}
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b..9a0db5bbabe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
goto out_pm;
err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_pm;
+
if (!node_state(node, N_HIGH_MEMORY))
goto out_pm;
diff --git a/mm/nommu.c b/mm/nommu.c
index 6f9248f89bd..48a2ecfaf05 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
+ flush_icache_range(mm->brk, brk);
return mm->brk = brk;
}
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
static void __put_nommu_region(struct vm_region *region)
__releases(nommu_region_sem)
{
- kenter("%p{%d}", region, atomic_read(&region->vm_usage));
+ kenter("%p{%d}", region, region->vm_usage);
BUG_ON(!nommu_region_tree.rb_node);
- if (atomic_dec_and_test(&region->vm_usage)) {
+ if (--region->vm_usage == 0) {
if (region->vm_top > region->vm_start)
delete_nommu_region(region);
up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
if (!vma)
goto error_getting_vma;
- atomic_set(&region->vm_usage, 1);
+ region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
}
/* we've found a region we can share */
- atomic_inc(&pregion->vm_usage);
+ pregion->vm_usage++;
vma->vm_region = pregion;
start = pregion->vm_start;
start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_region = NULL;
vma->vm_start = 0;
vma->vm_end = 0;
- atomic_dec(&pregion->vm_usage);
+ pregion->vm_usage--;
pregion = NULL;
goto error_just_free;
}
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
share:
add_vma_to_mm(current->mm, vma);
- up_write(&nommu_region_sem);
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }
- if (prot & PROT_EXEC)
- flush_icache_range(result, result + len);
+ up_write(&nommu_region_sem);
kleave(" = %lx", result);
return result;
@@ -1436,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
kenter("");
- /* we're only permitted to split anonymous regions that have a single
- * owner */
- if (vma->vm_file ||
- atomic_read(&vma->vm_region->vm_usage) != 1)
+ /* we're only permitted to split anonymous regions (these should have
+ * only a single usage on the region) */
+ if (vma->vm_file)
return -ENOMEM;
if (mm->map_count >= sysctl_max_map_count)
@@ -1513,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
/* cut the backing region down to size */
region = vma->vm_region;
- BUG_ON(atomic_read(&region->vm_usage) != 1);
+ BUG_ON(region->vm_usage != 1);
down_write(&nommu_region_sem);
delete_nommu_region(region);
@@ -1757,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
- * ask for an unmapped area at which to create a mapping on a file
- */
-unsigned long get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
- unsigned long, unsigned long);
-
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
-
- if (!get_area)
- return -ENOSYS;
-
- return get_area(file, addr, len, pgoff, flags);
-}
-EXPORT_SYMBOL(get_unmapped_area);
-
-/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
@@ -1916,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
- len -= copy_to_user((void *) addr, buf, len);
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
- len -= copy_from_user(buf, (void *) addr, len);
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
else
len = 0;
} else {
@@ -1929,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
mmput(mm);
return len;
}
+
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * automatically grant mappings that are too large.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+ size_t newsize)
+{
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+ struct vm_region *region;
+ pgoff_t low, high;
+ size_t r_size, r_top;
+
+ low = newsize >> PAGE_SHIFT;
+ high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ down_write(&nommu_region_sem);
+
+ /* search for VMAs that fall within the dead zone */
+ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+ low, high) {
+ /* found one - only interested if it's shared out of the page
+ * cache */
+ if (vma->vm_flags & VM_SHARED) {
+ up_write(&nommu_region_sem);
+ return -ETXTBSY; /* not quite true, but near enough */
+ }
+ }
+
+ /* reduce any regions that overlap the dead zone - if in existence,
+ * these will be pointed to by VMAs that don't overlap the dead zone
+ *
+ * we don't check for any regions that start beyond the EOF as there
+ * shouldn't be any
+ */
+ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+ 0, ULONG_MAX) {
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ region = vma->vm_region;
+ r_size = region->vm_top - region->vm_start;
+ r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+
+ if (r_top > newsize) {
+ region->vm_top -= r_top - newsize;
+ if (region->vm_end > region->vm_top)
+ region->vm_end = region->vm_top;
+ }
+ }
+
+ up_write(&nommu_region_sem);
+ return 0;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb5..8deb9d0fd5b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- __free_one_page(page, zone, 0, migratetype);
- trace_mm_page_pcpu_drain(page, 0, migratetype);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+ __free_one_page(page, zone, 0, page_private(page));
+ trace_mm_page_pcpu_drain(page, 0, page_private(page));
} while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
@@ -1222,10 +1223,10 @@ again:
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -3998,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
}
/* Merge backward if suitable */
- if (start_pfn < early_node_map[i].end_pfn &&
+ if (start_pfn < early_node_map[i].start_pfn &&
end_pfn >= early_node_map[i].start_pfn) {
early_node_map[i].start_pfn = start_pfn;
return;
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c..083e7c91e5f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
*/
void free_percpu(void *ptr)
{
- void *addr = __pcpu_ptr_to_addr(ptr);
+ void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
if (!ptr)
return;
+ addr = __pcpu_ptr_to_addr(ptr);
+
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee2268..e87e3724482 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
*/
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
- if (new < old) {
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, new);
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- }
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, new);
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f92..834db7be240 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);
-#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f25..ae007462b7f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
+
/*
* Purges all lazily-freed vmap areas.
*
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
} else
spin_lock(&purge_lock);
+ if (sync)
+ purge_fragmented_blocks_allcpus();
+
rcu_read_lock();
list_for_each_entry_rcu(va, &vmap_area_list, list) {
if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
}
rcu_read_unlock();
- if (nr) {
- BUG_ON(nr > atomic_read(&vmap_lazy_nr));
+ if (nr)
atomic_sub(nr, &vmap_lazy_nr);
- }
if (nr || force_flush)
flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
- struct list_head dirty;
- unsigned int nr_dirty;
};
struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
unsigned long free, dirty;
DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
- union {
- struct list_head free_list;
- struct rcu_head rcu_head;
- };
+ struct list_head free_list;
+ struct rcu_head rcu_head;
+ struct list_head purge;
};
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
vb->vbq = vbq;
spin_lock(&vbq->lock);
- list_add(&vb->free_list, &vbq->free);
+ list_add_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
struct vmap_block *tmp;
unsigned long vb_idx;
- BUG_ON(!list_empty(&vb->free_list));
-
vb_idx = addr_to_vb_idx(vb->va->va_start);
spin_lock(&vmap_block_tree_lock);
tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
call_rcu(&vb->rcu_head, rcu_free_vb);
}
+static void purge_fragmented_blocks(int cpu)
+{
+ LIST_HEAD(purge);
+ struct vmap_block *vb;
+ struct vmap_block *n_vb;
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+
+ if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ continue;
+
+ spin_lock(&vb->lock);
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+ vb->free = 0; /* prevent further allocs after releasing lock */
+ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
+ bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ spin_unlock(&vb->lock);
+ list_add_tail(&vb->purge, &purge);
+ } else
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
+static void purge_fragmented_blocks_thiscpu(void)
+{
+ purge_fragmented_blocks(smp_processor_id());
+}
+
+static void purge_fragmented_blocks_allcpus(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ purge_fragmented_blocks(cpu);
+}
+
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
+ int purge = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
int i;
spin_lock(&vb->lock);
+ if (vb->free < 1UL << order)
+ goto next;
+
i = bitmap_find_free_region(vb->alloc_map,
VMAP_BBMAP_BITS, order);
- if (i >= 0) {
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
- vb->free -= 1UL << order;
- if (vb->free == 0) {
- spin_lock(&vbq->lock);
- list_del_init(&vb->free_list);
- spin_unlock(&vbq->lock);
+ if (i < 0) {
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
+ /* fragmented and no outstanding allocations */
+ BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
+ purge = 1;
}
- spin_unlock(&vb->lock);
- break;
+ goto next;
+ }
+ addr = vb->va->va_start + (i << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) !=
+ addr_to_vb_idx(vb->va->va_start));
+ vb->free -= 1UL << order;
+ if (vb->free == 0) {
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
}
spin_unlock(&vb->lock);
+ break;
+next:
+ spin_unlock(&vb->lock);
}
+
+ if (purge)
+ purge_fragmented_blocks_thiscpu();
+
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
BUG_ON(!vb);
spin_lock(&vb->lock);
- bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+ BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
- BUG_ON(vb->free || !list_empty(&vb->free_list));
+ BUG_ON(vb->free);
spin_unlock(&vb->lock);
free_vmap_block(vb);
} else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
- INIT_LIST_HEAD(&vbq->dirty);
- vbq->nr_dirty = 0;
}
/* Import existing vmlist entries. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b..c26986c85ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (!populated_zone(zone))
continue;
+ if (zone_is_all_unreclaimable(zone))
+ continue;
+
if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
0, 0))
return 1;