Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               27
-rw-r--r--  mm/Makefile               1
-rw-r--r--  mm/backing-dev.c         16
-rw-r--r--  mm/bounce.c               2
-rw-r--r--  mm/compaction.c          81
-rw-r--r--  mm/early_ioremap.c      245
-rw-r--r--  mm/fremap.c               8
-rw-r--r--  mm/huge_memory.c        113
-rw-r--r--  mm/hugetlb.c            648
-rw-r--r--  mm/ksm.c                  2
-rw-r--r--  mm/memcontrol.c          42
-rw-r--r--  mm/memory-failure.c     111
-rw-r--r--  mm/memory.c              94
-rw-r--r--  mm/mempolicy.c           56
-rw-r--r--  mm/migrate.c             21
-rw-r--r--  mm/mlock.c                2
-rw-r--r--  mm/mmap.c                16
-rw-r--r--  mm/mprotect.c             7
-rw-r--r--  mm/mremap.c               9
-rw-r--r--  mm/oom_kill.c            70
-rw-r--r--  mm/page-writeback.c      72
-rw-r--r--  mm/page_alloc.c          32
-rw-r--r--  mm/page_cgroup.c          1
-rw-r--r--  mm/pagewalk.c             2
-rw-r--r--  mm/percpu-vm.c           22
-rw-r--r--  mm/percpu.c               2
-rw-r--r--  mm/pgtable-generic.c      8
-rw-r--r--  mm/rmap.c                29
-rw-r--r--  mm/shmem.c              116
-rw-r--r--  mm/slab.c                 2
-rw-r--r--  mm/slab.h                 2
-rw-r--r--  mm/slab_common.c          2
-rw-r--r--  mm/slub.c                12
-rw-r--r--  mm/swap.c                74
-rw-r--r--  mm/truncate.c            58
-rw-r--r--  mm/util.c                 9
-rw-r--r--  mm/vmalloc.c              6
-rw-r--r--  mm/vmscan.c              99
-rw-r--r--  mm/vmstat.c              96
39 files changed, 1621 insertions, 594 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06285b7..b2d1aed56439 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,30 @@ config FRONTSWAP
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
+
+config GENERIC_EARLY_IOREMAP
+ bool
+
+config CMA
+ bool "Contiguous Memory Allocator"
+ depends on HAVE_MEMBLOCK
+ select MIGRATION
+ select MEMORY_ISOLATION
+ help
+ This enables the Contiguous Memory Allocator which allows other
+ subsystems to allocate big physically-contiguous blocks of memory.
+ CMA reserves a region of memory and allows only movable pages to
+ be allocated from it. This way, the kernel can use the memory for
+	  pagecache and when a subsystem requests a contiguous area, the
+ allocated pages are migrated away to serve the contiguous request.
+
+ If unsure, say "n".
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL && CMA
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
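
The CMA help text above is the core of the change: the reserved range stays populated with movable pagecache pages until a driver asks for a physically contiguous block through dma_alloc_from_contiguous(), at which point those pages are migrated out of the way. A minimal, hypothetical driver-side sketch follows (not part of this patch; it assumes the <linux/dma-contiguous.h> prototypes of this kernel generation, and the example_* names are invented):

    #include <linux/dma-contiguous.h>
    #include <linux/device.h>

    /* Illustrative only: take 16 physically contiguous pages out of the
     * CMA region (movable pages occupying it are migrated away first),
     * then return them so the kernel can reuse the area for pagecache. */
    static struct page *example_grab_contig(struct device *dev)
    {
            return dma_alloc_from_contiguous(dev, 16 /* pages */,
                                             0 /* order alignment */);
    }

    static void example_release_contig(struct device *dev, struct page *pages)
    {
            dma_release_from_contiguous(dev, pages, 16);
    }
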
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..89244cb96221 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..eea1a9dfac38 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -287,13 +287,19 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
* Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
* by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
*/
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
+ spin_lock_bh(&bdi->wb_lock);
+ if (test_bit(BDI_registered, &bdi->state))
+ queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
+ spin_unlock_bh(&bdi->wb_lock);
}
/*
@@ -306,9 +312,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
spin_unlock_bh(&bdi_lock);
synchronize_rcu_expedited();
-
- /* bdi_list is now unused, clear it to mark @bdi dying */
- INIT_LIST_HEAD(&bdi->bdi_list);
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -359,6 +362,11 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
*/
bdi_remove_from_list(bdi);
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ clear_bit(BDI_registered, &bdi->state);
+ spin_unlock_bh(&bdi->wb_lock);
+
/*
* Drain work list and shutdown the delayed_work. At this point,
 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
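
The comment added to bdi_wakeup_thread_delayed() is the heart of this hunk: mod_delayed_work() re-arms the timer even when the work is already pending, so it can push an already-scheduled flush further into the future, whereas queue_delayed_work() leaves pending work untouched. A condensed sketch of the resulting behaviour, mirroring the patched function for illustration only:

    /* Sketch, not part of the patch: both helpers take (wq, dwork, delay). */
    static void wakeup_flusher_sketch(struct backing_dev_info *bdi,
                                      unsigned long timeout)
    {
            /* mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout) would modify
             * the timer of an already-pending work item, possibly postponing
             * a flush that was scheduled for an earlier time. */

            /* queue_delayed_work() returns false and changes nothing when the
             * work is already pending, so an earlier wake-up survives; the
             * BDI_registered check avoids racing with bdi unregistration. */
            spin_lock_bh(&bdi->wb_lock);
            if (test_bit(BDI_registered, &bdi->state))
                    queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
            spin_unlock_bh(&bdi->wb_lock);
    }
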
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..5a7d58fb883b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
struct bio_vec *to, *from;
unsigned i;
+ if (force)
+ goto bounce;
bio_for_each_segment(from, *bio_orig, i)
if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
goto bounce;
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bdb..fb797a32362f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
bool migrate_scanner)
{
struct zone *zone = cc->zone;
+
+ if (cc->ignore_skip_hint)
+ return;
+
if (!page)
return;
@@ -248,7 +252,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
- unsigned long nr_strict_required = end_pfn - blockpfn;
unsigned long flags;
bool locked = false;
@@ -261,11 +264,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
nr_scanned++;
if (!pfn_valid_within(blockpfn))
- continue;
+ goto isolate_fail;
+
if (!valid_page)
valid_page = page;
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/*
* The zone lock must be held to isolate freepages.
@@ -286,12 +290,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
- if (!isolated && strict)
- break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -302,7 +304,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (isolated) {
blockpfn += isolated - 1;
cursor += isolated - 1;
+ continue;
}
+
+isolate_fail:
+ if (strict)
+ break;
+ else
+ continue;
+
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -312,7 +322,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
- if (strict && nr_strict_required > total_isolated)
+ if (strict && blockpfn < end_pfn)
total_isolated = 0;
if (locked)
@@ -647,17 +657,21 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
+ unsigned long high_pfn, low_pfn, pfn, z_end_pfn;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
- * scanned from (or the end of the zone if starting). The low point
- * is the end of the pageblock the migration scanner is using.
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+ * the pageblock boundary, because we do pfn -= pageblock_nr_pages
+ * in the for loop.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
*/
- pfn = cc->free_pfn;
- low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
* Take care that if the migration scanner is at the end of the zone
@@ -673,9 +687,10 @@ static void isolate_freepages(struct zone *zone,
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ unsigned long end_pfn;
if (!pfn_valid(pfn))
continue;
@@ -703,13 +718,10 @@ static void isolate_freepages(struct zone *zone,
isolated = 0;
/*
- * As pfn may not start aligned, pfn+pageblock_nr_page
- * may cross a MAX_ORDER_NR_PAGES boundary and miss
- * a pfn_valid check. Ensure isolate_freepages_block()
- * only scans within a pageblock
+ * Take care when isolating in last pageblock of a zone which
+ * ends in the middle of a pageblock.
*/
- end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
- end_pfn = min(end_pfn, z_end_pfn);
+ end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
isolated = isolate_freepages_block(cc, pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
@@ -728,7 +740,14 @@ static void isolate_freepages(struct zone *zone,
/* split_free_page does not map the pages */
map_pages(freelist);
- cc->free_pfn = high_pfn;
+ /*
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ */
+ if (pfn < low_pfn)
+ cc->free_pfn = max(pfn, zone->zone_start_pfn);
+ else
+ cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
}
@@ -937,6 +956,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
/*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ /*
* Setup to move all movable pages to the end of the zone. Used cached
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
@@ -952,14 +979,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
- /*
- * Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
- */
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
- __reset_isolation_suitable(zone);
-
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -993,7 +1012,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- if (err == -ENOMEM) {
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ */
+ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
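
Two of the compaction changes above are pure power-of-two rounding: the free scanner start is rounded down to a pageblock boundary with pfn & ~(pageblock_nr_pages - 1), and the low boundary is rounded up with ALIGN(). A small stand-alone illustration of those idioms, with pageblock_nr_pages replaced by a made-up constant (it is always a power of two in the kernel):

    #include <stdio.h>

    /* Same rounding idioms the patch uses; the mask trick requires a
     * power-of-two alignment. */
    #define PAGEBLOCK_NR_PAGES 512UL
    #define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
    #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

    int main(void)
    {
            unsigned long free_pfn = 262000;     /* arbitrary scanner position */
            unsigned long migrate_pfn = 131071;

            /* pfn = cc->free_pfn & ~(pageblock_nr_pages - 1); */
            printf("free scanner start: %lu\n",
                   ALIGN_DOWN(free_pfn, PAGEBLOCK_NR_PAGES));          /* 261632 */

            /* low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); */
            printf("low boundary:       %lu\n",
                   ALIGN_UP(migrate_pfn + 1, PAGEBLOCK_NR_PAGES));     /* 131072 */
            return 0;
    }
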
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+void __init early_ioremap_reset(void)
+{
+ early_ioremap_shutdown();
+ after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t prot)
+{
+ BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+ BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (WARN_ON(prev_map[i]))
+ break;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (WARN(count, KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n"
+ "please boot with early_ioremap_debug and report the dmesg.\n",
+ count))
+ return 1;
+ return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+ unsigned long offset;
+ resource_size_t last_addr;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+ __func__, (u64)phys_addr, size))
+ return NULL;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (WARN_ON(!size || last_addr < phys_addr))
+ return NULL;
+
+ prev_size[slot] = size;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+ return NULL;
+
+ /*
+ * Ok, go for it..
+ */
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_set_fixmap(idx, phys_addr, prot);
+ else
+ __early_set_fixmap(idx, phys_addr, prot);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+ return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size))
+ return;
+
+ if (WARN(prev_size[slot] != size,
+ "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]))
+ return;
+
+ WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+ addr, size, slot);
+
+ virt_addr = (unsigned long)addr;
+ if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+ return;
+
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+ --idx;
+ --nrpages;
+ }
+ prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{
+ early_iounmap((__force void __iomem *)addr, size);
+}
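
The new file hands out one of FIX_BTMAPS_SLOTS fixed-size fixmap slots per mapping and insists that the unmap call pass back exactly the pointer and size it returned; otherwise the consistency WARNs fire and check_early_ioremap_leak() reports the leak at late_initcall time. A hypothetical boot-time caller (the function name and table layout are invented for illustration) would use it like this:

    /* Illustrative sketch of a boot-time user; not part of this file. */
    static int __init example_parse_firmware_table(resource_size_t phys,
                                                   unsigned long len)
    {
            void *virt;

            virt = early_memremap(phys, len);   /* occupies one BTMAP slot */
            if (!virt)
                    return -ENOMEM;

            /* ... read the table through 'virt' before ioremap() is usable ... */

            /* Release with the same pointer and size that were handed out,
             * or early_iounmap() warns and the slot stays leaked. */
            early_memunmap(virt, len);
            return 0;
    }
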
diff --git a/mm/fremap.c b/mm/fremap.c
index 87da3590c61e..1fb6bfe39d8c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -203,9 +203,10 @@ get_write_lock:
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
struct file *file = get_file(vma->vm_file);
+ /* mmap_region may free vma; grab the info now */
+ vm_flags = vma->vm_flags;
- addr = mmap_region(file, start, size,
- vma->vm_flags, pgoff);
+ addr = mmap_region(file, start, size, vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
@@ -213,7 +214,7 @@ get_write_lock:
BUG_ON(addr != start);
err = 0;
}
- goto out;
+ goto out_freed;
}
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
@@ -248,6 +249,7 @@ get_write_lock:
out:
if (vma)
vm_flags = vma->vm_flags;
+out_freed:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
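
The fremap.c change is a use-after-free fix with a simple rule behind it: copy out any field you still need before calling a function that may free the object, as the patch now does with vma->vm_flags ahead of mmap_region(). A stand-alone C illustration of the same rule:

    #include <stdio.h>
    #include <stdlib.h>

    struct vma_like {
            unsigned long vm_flags;
    };

    /* Stand-in for mmap_region(): may free the object it was derived from. */
    static long consume_and_free(struct vma_like *v)
    {
            free(v);
            return 0;
    }

    int main(void)
    {
            struct vma_like *vma = malloc(sizeof(*vma));
            unsigned long vm_flags;

            if (!vma)
                    return 1;
            vma->vm_flags = 0x42;

            /* Copy what we need *before* the call that may free it, exactly
             * as the patch copies vma->vm_flags before mmap_region(). */
            vm_flags = vma->vm_flags;
            consume_and_free(vma);

            printf("flags: %#lx\n", vm_flags);  /* safe: uses the copy */
            return 0;
    }
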
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..d21c9ef0943c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,7 +1166,7 @@ alloc:
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- if (is_huge_zero_pmd(orig_pmd)) {
+ if (!page) {
ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
address, pmd, orig_pmd, haddr);
} else {
@@ -1190,7 +1190,7 @@ alloc:
goto out;
}
- if (is_huge_zero_pmd(orig_pmd))
+ if (!page)
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1215,7 +1215,7 @@ alloc:
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (is_huge_zero_pmd(orig_pmd)) {
+ if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
@@ -1288,64 +1288,104 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int page_nid = -1, this_nid = numa_node_id();
int target_nid;
- int current_nid = -1;
- bool migrated;
+ bool page_locked;
+ bool migrated = false;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- get_page(page);
- current_nid = page_to_nid(page);
+ page_nid = page_to_nid(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == this_nid)
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- put_page(page);
- goto clear_pmdnuma;
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+
+ /*
+ * Otherwise wait for potential migrations and retry. We do
+ * relock and check_same as the page may no longer be mapped.
+ * As the fault is being retried, do not account for it.
+ */
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
}
- /* Acquire the page lock to serialise THP migrations */
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
- lock_page(page);
+ if (!page_locked)
+ lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PTE did not change while locked */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ page_nid = -1;
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
- /* Migrate the THP to the requested node */
+ /* Bail if we fail to protect against THP splits for any reason */
+ if (unlikely(!anon_vma)) {
+ put_page(page);
+ page_nid = -1;
+ goto clear_pmdnuma;
+ }
+
+ /*
+ * The page_table_lock above provides a memory barrier
+ * with change_protection_range.
+ */
+ if (mm_tlb_flush_pending(mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (!migrated)
- goto check_same;
+ if (migrated)
+ page_nid = target_nid;
- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
- return 0;
-
-check_same:
- spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ goto out;
clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
out_unlock:
spin_unlock(&mm->page_table_lock);
- if (current_nid != -1)
- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
return 0;
}
@@ -1693,21 +1733,24 @@ static int __split_huge_page_map(struct page *page,
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
+ if (pmd_write(*pmd))
+ BUG_ON(page_mapcount(page) != 1);
haddr = address;
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
+ /*
+ * Note that pmd_numa is not transferred deliberately
+ * to avoid any possibility that pte_numa leaks to
+ * a PROT_NONE VMA by accident.
+ */
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
- else
- BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
- if (pmd_numa(*pmd))
- entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -2286,6 +2329,8 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2697,6 +2742,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2765,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
split_huge_page(page);
put_page(page);
- BUG_ON(pmd_trans_huge(*pmd));
+
+ /*
+ * We don't always have down_write of mmap_sem here: a racing
+ * do_huge_pmd_wp_page() might have copied-on-write to another
+ * huge page before our split_huge_page() got the anon_vma lock.
+ */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ goto again;
}
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf73a551..ea32a04296f0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/page-isolation.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Decrement the reserved pages in the hugepage pool by one */
-static void decrement_hugepage_resv_vma(struct hstate *h,
- struct vm_area_struct *vma)
-{
- if (vma->vm_flags & VM_NORESERVE)
- return;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- /* Shared mappings always use reserves */
- h->resv_huge_pages--;
- } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- h->resv_huge_pages--;
- }
-}
-
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
@@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
{
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+	 * This address is already reserved by another process (chg == 0),
+	 * so we should decrement the reserved count. Without decrementing,
+	 * the reserve count would remain after releasing the inode, because
+	 * the allocated page will go into the page cache and be regarded as
+	 * coming from the reserved pool in the releasing step. Currently we
+	 * don't have any better way to deal with this situation, so add a
+	 * work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return 1;
+ else
+ return 0;
+ }
+
+ /* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE)
return 1;
+
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
+
return 0;
}
@@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (list_empty(&h->hugepage_freelists[nid]))
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+ if (!is_migrate_isolate_page(page))
+ break;
+ /*
+ * if 'non-isolated free hugepage' not found on the list,
+ * the allocation fails.
+ */
+ if (&h->hugepage_freelists[nid] == &page->lru)
return NULL;
- page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
@@ -529,7 +540,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve)
+ unsigned long address, int avoid_reserve,
+ long chg)
{
struct page *page = NULL;
struct mempolicy *mpol;
@@ -548,7 +560,7 @@ retry_cpuset:
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma) &&
+ if (!vma_has_reserves(vma, chg) &&
h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
@@ -561,8 +573,13 @@ retry_cpuset:
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
+ if (avoid_reserve)
+ break;
+ if (!vma_has_reserves(vma, chg))
+ break;
+
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
break;
}
}
@@ -620,15 +637,20 @@ static void free_huge_page(struct page *page)
int nid = page_to_nid(page);
struct hugepage_subpool *spool =
(struct hugepage_subpool *)page_private(page);
+ bool restore_reserve;
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
+ restore_reserve = PagePrivate(page);
spin_lock(&hugetlb_lock);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
+ if (restore_reserve)
+ h->resv_huge_pages++;
+
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
/* remove the page from active list */
list_del(&page->lru);
@@ -690,6 +712,40 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+/*
+ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
+ * normal or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+ compound_page_dtor *dtor;
+
+ if (!PageHead(page_head))
+ return 0;
+
+ dtor = get_compound_page_dtor(page_head);
+
+ return dtor == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHeadHuge);
+
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -755,33 +811,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
return nid;
}
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
-{
- struct page *page;
- int start_nid;
- int next_nid;
- int ret = 0;
-
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- page = alloc_fresh_huge_page_node(h, next_nid);
- if (page) {
- ret = 1;
- break;
- }
- next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- } while (next_nid != start_nid);
-
- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-
- return ret;
-}
-
/*
* helper for free_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
@@ -800,6 +829,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
return nid;
}
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ struct page *page;
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_huge_page_node(h, node);
+ if (page) {
+ ret = 1;
+ break;
+ }
+ }
+
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return ret;
+}
+
/*
* Free huge page from pool from next node to free.
* Attempt to keep persistent huge pages more or less
@@ -809,36 +872,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
bool acct_surplus)
{
- int start_nid;
- int next_nid;
+ int nr_nodes, node;
int ret = 0;
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
- if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
- !list_empty(&h->hugepage_freelists[next_nid])) {
+ if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
+ !list_empty(&h->hugepage_freelists[node])) {
struct page *page =
- list_entry(h->hugepage_freelists[next_nid].next,
+ list_entry(h->hugepage_freelists[node].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
- h->free_huge_pages_node[next_nid]--;
+ h->free_huge_pages_node[node]--;
if (acct_surplus) {
h->surplus_huge_pages--;
- h->surplus_huge_pages_node[next_nid]--;
+ h->surplus_huge_pages_node[node]--;
}
update_and_free_page(h, page);
ret = 1;
break;
}
- next_nid = hstate_next_node_to_free(h, nodes_allowed);
- } while (next_nid != start_nid);
+ }
return ret;
}
@@ -927,10 +985,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
- struct page *page;
+ struct page *page = NULL;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_node(h, nid);
+ if (h->free_huge_pages - h->resv_huge_pages > 0)
+ page = dequeue_huge_page_node(h, nid);
spin_unlock(&hugetlb_lock);
if (!page)
@@ -1018,11 +1077,8 @@ free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
- if (!list_empty(&surplus_list)) {
- list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- put_page(page);
- }
- }
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru)
+ put_page(page);
spin_lock(&hugetlb_lock);
return ret;
@@ -1059,6 +1115,7 @@ static void return_unused_surplus_pages(struct hstate *h,
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
+ cond_resched_lock(&hugetlb_lock);
}
}
@@ -1089,9 +1146,9 @@ static long vma_needs_reservation(struct hstate *h,
} else {
long err;
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
- err = region_chg(&reservations->regions, idx, idx + 1);
+ err = region_chg(&resv->regions, idx, idx + 1);
if (err < 0)
return err;
return 0;
@@ -1109,10 +1166,10 @@ static void vma_commit_reservation(struct hstate *h,
} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
/* Mark this page used in the map. */
- region_add(&reservations->regions, idx, idx + 1);
+ region_add(&resv->regions, idx, idx + 1);
}
}
@@ -1138,38 +1195,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg)
- if (hugepage_subpool_get_pages(spool, chg))
+ if (chg || avoid_reserve)
+ if (hugepage_subpool_get_pages(spool, 1))
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret) {
- hugepage_subpool_put_pages(spool, chg);
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
return ERR_PTR(-ENOSPC);
}
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- if (page) {
- /* update page cgroup details */
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
- h_cg, page);
- spin_unlock(&hugetlb_lock);
- } else {
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_cgroup_uncharge_cgroup(idx,
pages_per_huge_page(h),
h_cg);
- hugepage_subpool_put_pages(spool, chg);
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
return ERR_PTR(-ENOSPC);
}
spin_lock(&hugetlb_lock);
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
- h_cg, page);
list_move(&page->lru, &h->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ /* Fall through */
}
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
@@ -1180,14 +1234,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_states[N_MEMORY]);
+ int nr_nodes, node;
- while (nr_nodes) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(hstate_next_node_to_alloc(h,
- &node_states[N_MEMORY])),
+ addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1199,7 +1251,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
- nr_nodes--;
}
return 0;
@@ -1338,48 +1389,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
- int start_nid, next_nid;
- int ret = 0;
+ int nr_nodes, node;
VM_BUG_ON(delta != -1 && delta != 1);
- if (delta < 0)
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- else
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- int nid = next_nid;
- if (delta < 0) {
- /*
- * To shrink on this node, there must be a surplus page
- */
- if (!h->surplus_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_alloc(h,
- nodes_allowed);
- continue;
- }
+ if (delta < 0) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node])
+ goto found;
}
- if (delta > 0) {
- /*
- * Surplus cannot exceed the total number of pages
- */
- if (h->surplus_huge_pages_node[nid] >=
- h->nr_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_free(h,
- nodes_allowed);
- continue;
- }
+ } else {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node] <
+ h->nr_huge_pages_node[node])
+ goto found;
}
+ }
+ return 0;
- h->surplus_huge_pages += delta;
- h->surplus_huge_pages_node[nid] += delta;
- ret = 1;
- break;
- } while (next_nid != start_nid);
-
- return ret;
+found:
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[node] += delta;
+ return 1;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1446,6 +1477,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
+ cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2190,7 +2222,7 @@ out:
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
/*
* This new VMA should share its siblings reservation map if present.
@@ -2200,34 +2232,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (reservations)
- kref_get(&reservations->refs);
+ if (resv)
+ kref_get(&resv->refs);
}
static void resv_map_put(struct vm_area_struct *vma)
{
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
- if (!reservations)
+ if (!resv)
return;
- kref_put(&reservations->refs, resv_map_release);
+ kref_put(&resv->refs, resv_map_release);
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *reservations = vma_resv_map(vma);
+ struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
- if (reservations) {
+ if (resv) {
start = vma_hugecache_offset(h, vma, vma->vm_start);
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) -
- region_count(&reservations->regions, start, end);
+ region_count(&resv->regions, start, end);
resv_map_put(vma);
@@ -2285,6 +2317,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
+}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -2295,16 +2352,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte)
- goto nomem;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
@@ -2312,7 +2379,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock(&dst->page_table_lock);
spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
- if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ entry = huge_ptep_get(src_pte);
+ if (huge_pte_none(entry)) { /* skip none entry */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
+ }
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_ptep_get(src_pte);
@@ -2324,36 +2408,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_unlock(&src->page_table_lock);
spin_unlock(&dst->page_table_lock);
}
- return 0;
-nomem:
- return -ENOMEM;
-}
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-static int is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
- else
- return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
- else
- return 0;
+ return ret;
}
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2473,7 +2532,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2540,7 +2599,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int avoidcopy;
int outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2550,10 +2608,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_mapcount(old_page) == 1);
- if (avoidcopy) {
- if (PageAnon(old_page))
- page_move_anon_rmap(old_page, vma, address);
+ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2567,8 +2623,7 @@ retry_avoidcopy:
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
- if (!(vma->vm_flags & VM_MAYSHARE) &&
- is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
@@ -2640,6 +2695,8 @@ retry_avoidcopy:
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ ClearPagePrivate(new_page);
+
/* Break COW */
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
@@ -2651,10 +2708,11 @@ retry_avoidcopy:
}
spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- /* Caller expects lock to be held */
- spin_lock(&mm->page_table_lock);
page_cache_release(new_page);
page_cache_release(old_page);
+
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return 0;
}
@@ -2750,6 +2808,7 @@ retry:
goto retry;
goto out;
}
+ ClearPagePrivate(page);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
@@ -2796,8 +2855,10 @@ retry:
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
- if (anon_rmap)
+ if (anon_rmap) {
+ ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
+ }
else
page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -2931,15 +2992,6 @@ out_mutex:
return ret;
}
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -3169,6 +3221,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&mm->page_table_lock);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
+ }
+ return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
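
Of the helpers added above, __basepage_index() is plain arithmetic: the page-cache index of a base page inside a huge page is the head page's index shifted left by the compound order, plus the tail page's offset within the compound page. A stand-alone version of that computation with made-up numbers:

    #include <stdio.h>

    /* Mirror of the __basepage_index() arithmetic added above, with the
     * kernel types replaced by plain integers for illustration. */
    static unsigned long basepage_index(unsigned long head_index,
                                        unsigned int compound_order,
                                        unsigned long compound_idx)
    {
            return (head_index << compound_order) + compound_idx;
    }

    int main(void)
    {
            /* A 2 MiB huge page (order 9, 512 base pages) cached at huge-page
             * index 3; ask for the base page at offset 7 inside it. */
            printf("%lu\n", basepage_index(3, 9, 7));   /* 3*512 + 7 = 1543 */
            return 0;
    }
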
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c440d8..784d1e4bc385 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
- struct page *head = compound_trans_head(page);
+ struct page *head = compound_head(page);
/*
 * head may actually be split and freed from under
* us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..f45e21ab9cea 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -379,7 +379,7 @@ struct mem_cgroup {
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
@@ -1220,7 +1220,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (dead_count == iter->last_dead_count) {
smp_rmb();
last_visited = iter->last_visited;
- if (last_visited &&
+ if (last_visited && last_visited != root &&
!css_tryget(&last_visited->css))
last_visited = NULL;
}
@@ -1229,7 +1229,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- if (last_visited)
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
iter->last_visited = memcg;
@@ -3186,11 +3186,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
if (!s->memcg_params)
return -ENOMEM;
- INIT_WORK(&s->memcg_params->destroy,
- kmem_cache_destroy_work_func);
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
+ INIT_WORK(&s->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
} else
s->memcg_params->is_root_cache = true;
@@ -5584,7 +5584,13 @@ static int compare_thresholds(const void *a, const void *b)
const struct mem_cgroup_threshold *_a = a;
const struct mem_cgroup_threshold *_b = b;
- return _a->threshold - _b->threshold;
+ if (_a->threshold > _b->threshold)
+ return 1;
+
+ if (_a->threshold < _b->threshold)
+ return -1;
+
+ return 0;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -6296,16 +6302,6 @@ mem_cgroup_css_online(struct cgroup *cont)
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- if (error) {
- /*
- * We call put now because our (and parent's) refcnts
- * are already in place. mem_cgroup_put() will internally
- * call __mem_cgroup_free, so return directly
- */
- mem_cgroup_put(memcg);
- if (parent->use_hierarchy)
- mem_cgroup_put(parent);
- }
return error;
}
@@ -6330,9 +6326,23 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct cgroup *iter;
mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ rcu_read_lock();
+ cgroup_for_each_descendant_post(iter, cont) {
+ rcu_read_unlock();
+ mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
mem_cgroup_reparent_charges(memcg);
+
mem_cgroup_destroy_all_caches(memcg);
}
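
The compare_thresholds() hunk above replaces a returned difference with explicit comparisons: the thresholds are 64-bit counters, so the subtraction can wrap or be truncated when narrowed to the comparator's int return value and report the wrong ordering. A stand-alone illustration of the safe comparator with qsort() and invented sample values:

    #include <stdio.h>
    #include <stdlib.h>

    struct threshold {
            unsigned long long threshold;
    };

    /* Safe three-way comparison; returning the raw difference would be
     * truncated to int and could flip the sign for large values. */
    static int compare_thresholds(const void *a, const void *b)
    {
            const struct threshold *_a = a;
            const struct threshold *_b = b;

            if (_a->threshold > _b->threshold)
                    return 1;
            if (_a->threshold < _b->threshold)
                    return -1;
            return 0;
    }

    int main(void)
    {
            struct threshold t[] = { { 1ULL << 40 }, { 5 }, { 1ULL << 33 } };
            size_t i;

            qsort(t, 3, sizeof(t[0]), compare_thresholds);
            for (i = 0; i < 3; i++)
                    printf("%llu\n", t[i].threshold);   /* 5, 2^33, 2^40 */
            return 0;
    }
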
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..603f1fa1b7a3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -208,9 +208,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#endif
si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
- if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, t);
+ ret = force_sig_info(SIGBUS, &si, current);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -382,10 +382,12 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+static int task_early_kill(struct task_struct *tsk, int force_early)
{
if (!tsk->mm)
return 0;
+ if (force_early)
+ return 1;
if (tsk->flags & PF_MCE_PROCESS)
return !!(tsk->flags & PF_MCE_EARLY);
return sysctl_memory_failure_early_kill;
@@ -395,7 +397,7 @@ static int task_early_kill(struct task_struct *tsk)
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -411,7 +413,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
for_each_process (tsk) {
struct anon_vma_chain *vmac;
- if (!task_early_kill(tsk))
+ if (!task_early_kill(tsk, force_early))
continue;
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
@@ -430,7 +432,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -441,7 +443,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- if (!task_early_kill(tsk))
+ if (!task_early_kill(tsk, force_early))
continue;
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
@@ -467,7 +469,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -478,9 +481,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -854,14 +857,14 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = compound_head(p);
+ struct page *hpage = *hpagep;
struct page *ppage;
if (PageReserved(p) || PageSlab(p))
@@ -936,6 +939,21 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
}
@@ -950,19 +968,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill);
-
- if (hpage != ppage)
- lock_page(ppage);
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
- if (hpage != ppage)
- unlock_page(ppage);
-
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1074,15 +1086,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "just unpoisoned", "filter hit", and
- * "race with other subpage."
+ * Check "filter hit" and "race with other subpage."
*/
lock_page(hpage);
- if (!PageHWPoison(hpage)
- || (hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- return 0;
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1143,6 +1156,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
@@ -1179,8 +1194,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
@@ -1410,7 +1429,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/*
* Isolate the page, so that it doesn't get reallocated if it
- * was free.
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
*/
set_migratetype_isolate(p, true);
/*
@@ -1433,7 +1453,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p, MIGRATE_MOVABLE);
unlock_memory_hotplug();
return ret;
}
@@ -1489,12 +1508,17 @@ static int soft_offline_huge_page(struct page *page, int flags)
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
} else {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_trans_order(hpage),
- &num_poisoned_pages);
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
}
- /* keep elevated page count for bad page */
return ret;
}
@@ -1526,7 +1550,7 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_trans_head(page);
+ struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1559,7 +1583,7 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
- /* keep elevated page count for bad page */
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
@@ -1625,7 +1649,22 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in pagevec and actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page should be freed back to buddy before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
atomic_long_inc(&num_poisoned_pages);
}
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 61a262b08e53..8a6ab7be24b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
+EXPORT_SYMBOL(zero_pfn);
+
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
@@ -211,14 +213,15 @@ static int tlb_next_batch(struct mmu_gather *tlb)
* tear-down from @mm. The @fullmm argument is used when @mm is without
* users and we're going to destroy the full address space (exit/execve).
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
tlb->mm = mm;
- tlb->fullmm = fullmm;
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
tlb->need_flush_all = 0;
- tlb->start = -1UL;
- tlb->end = 0;
+ tlb->start = start;
+ tlb->end = end;
tlb->need_flush = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
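
In the new tlb_gather_mmu() above, !(start | (end + 1)) is a compact way of asking whether the caller passed the whole address space, i.e. start == 0 and end == ~0UL. A tiny standalone check of that equivalence, illustrative only and not kernel code:

#include <assert.h>
#include <limits.h>
#include <stdio.h>

/* 1 exactly when start == 0 and end == ULONG_MAX: end + 1 wraps to 0
 * only for ULONG_MAX, and OR-ing in start keeps the result zero only
 * when start is zero too. */
static int is_fullmm(unsigned long start, unsigned long end)
{
        return !(start | (end + 1));
}

int main(void)
{
        assert(is_fullmm(0, ULONG_MAX) == 1);
        assert(is_fullmm(0, 0x7fffffff) == 0);
        assert(is_fullmm(4096, ULONG_MAX) == 0);
        printf("fullmm check behaves as expected\n");
        return 0;
}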
@@ -258,8 +261,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
- tlb->start = start;
- tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -1203,13 +1204,23 @@ again:
* and page-free while holding it.
*/
if (force_flush) {
+ unsigned long old_end;
+
force_flush = 0;
-#ifdef HAVE_GENERIC_MMU_GATHER
- tlb->start = addr;
- tlb->end = end;
-#endif
+ /*
+ * Flush the TLB just for the previous segment,
+ * then update the range to be the remaining
+ * TLB range.
+ */
+ old_end = tlb->end;
+ tlb->end = addr;
+
tlb_flush_mmu(tlb);
+
+ tlb->start = addr;
+ tlb->end = old_end;
+
if (addr != end)
goto again;
}
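
The force_flush path above flushes only the part of the range already torn down, then narrows the tracked range to the remainder before looping. A toy standalone version of that bookkeeping, with printf standing in for the TLB flush:

#include <stdio.h>

static void flush_range(unsigned long start, unsigned long end)
{
        printf("flush [%#lx, %#lx)\n", start, end);
}

/* Walk [start, end) but stop early at a forced flush point, flush what
 * was covered so far, then resume with the remaining sub-range. */
static void process(unsigned long start, unsigned long end,
                    unsigned long force_flush_at)
{
        unsigned long addr = start;

        while (addr < end) {
                unsigned long stop = end;

                if (force_flush_at > addr && force_flush_at < end)
                        stop = force_flush_at;  /* flush early */
                flush_range(addr, stop);        /* covers only [addr, stop) */
                addr = stop;                    /* continue with the rest */
        }
}

int main(void)
{
        process(0x1000, 0x9000, 0x4000);
        return 0;
}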
@@ -1396,7 +1407,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
@@ -1422,7 +1433,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, address, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, address, end);
unmap_single_vma(&tlb, vma, address, end, details);
@@ -1928,12 +1939,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags)
{
struct vm_area_struct *vma;
+ vm_flags_t vm_flags;
int ret;
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
+ vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+ if (!(vm_flags & vma->vm_flags))
+ return -EFAULT;
+
ret = handle_mm_fault(mm, vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -3516,12 +3532,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+ unsigned long addr, int page_nid)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
return mpol_misplaced(page, vma, addr);
@@ -3532,7 +3548,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
- int current_nid = -1;
+ int page_nid = -1;
int target_nid;
bool migrated = false;
@@ -3562,15 +3578,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- current_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
- /*
- * Account for the fault against the current node if it not
- * being replaced regardless of where the page is located.
- */
- current_nid = numa_node_id();
put_page(page);
goto out;
}
@@ -3578,11 +3589,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, target_nid);
if (migrated)
- current_nid = target_nid;
+ page_nid = target_nid;
out:
- if (current_nid != -1)
- task_numa_fault(current_nid, 1, migrated);
+ if (page_nid != -1)
+ task_numa_fault(page_nid, 1, migrated);
return 0;
}
@@ -3597,7 +3608,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long offset;
spinlock_t *ptl;
bool numa = false;
- int local_nid = numa_node_id();
spin_lock(&mm->page_table_lock);
pmd = *pmdp;
@@ -3620,9 +3630,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
- int curr_nid = local_nid;
+ int page_nid = -1;
int target_nid;
- bool migrated;
+ bool migrated = false;
+
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
@@ -3644,25 +3655,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(page_mapcount(page) != 1))
continue;
- /*
- * Note that the NUMA fault is later accounted to either
- * the node that is currently running or where the page is
- * migrated to.
- */
- curr_nid = local_nid;
- target_nid = numa_migrate_prep(page, vma, addr,
- page_to_nid(page));
- if (target_nid == -1) {
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ pte_unmap_unlock(pte, ptl);
+ if (target_nid != -1) {
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ page_nid = target_nid;
+ } else {
put_page(page);
- continue;
}
- /* Migrate to the requested node */
- pte_unmap_unlock(pte, ptl);
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- curr_nid = target_nid;
- task_numa_fault(curr_nid, 1, migrated);
+ if (page_nid != -1)
+ task_numa_fault(page_nid, 1, migrated);
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
@@ -4065,6 +4070,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return len;
}
+EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 74310017296e..b2061bb5af73 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,19 +608,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
* If pagelist != NULL then isolate pages from the LRU and
* put them on the pagelist.
*/
-static struct vm_area_struct *
+static int
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
- int err;
- struct vm_area_struct *first, *vma, *prev;
-
+ int err = 0;
+ struct vm_area_struct *vma, *prev;
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
+ vma = find_vma(mm, start);
+ if (!vma)
+ return -EFAULT;
prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
@@ -630,9 +629,9 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
if (is_vm_hugetlb_page(vma))
@@ -649,15 +648,13 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
- if (err) {
- first = ERR_PTR(err);
+ if (err)
break;
- }
}
next:
prev = vma;
}
- return first;
+ return err;
}
/*
@@ -732,7 +729,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (prev) {
vma = prev;
next = vma->vm_next;
- continue;
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+ /* vma_merge() joined vma && vma->next, case 8 */
+ goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
@@ -744,6 +744,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
+ replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
@@ -1134,16 +1135,17 @@ out:
/*
* Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
* Search forward from there, if not. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ struct vm_area_struct *vma;
unsigned long uninitialized_var(address);
+ vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
@@ -1169,7 +1171,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
return NULL;
}
@@ -1179,7 +1181,6 @@ static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
- struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
@@ -1245,11 +1246,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- vma = check_range(mm, start, end, nmask,
+ err = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
-
- err = PTR_ERR(vma); /* maybe ... */
- if (!IS_ERR(vma))
+ if (!err)
err = mbind_range(mm, start, end, new);
if (!err) {
@@ -1257,9 +1256,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma,
- MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ nr_failed = migrate_pages(&pagelist, new_page,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
@@ -2088,7 +2086,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
- rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
@@ -2096,7 +2093,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
@@ -2797,7 +2793,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
*/
VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
- if (!pol || pol == &default_policy)
+ if (!pol || pol == &default_policy || (pol->flags & MPOL_F_MORON))
mode = MPOL_DEFAULT;
else
mode = pol->mode;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c24438bba..a88c12f2235d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -103,7 +103,7 @@ void putback_movable_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(balloon_page_movable(page)))
+ if (unlikely(isolated_balloon_page(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
@@ -1710,12 +1710,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unlock_page(new_page);
put_page(new_page); /* Free it */
- unlock_page(page);
+ /* Retake the callers reference and putback on LRU */
+ get_page(page);
putback_lru_page(page);
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- isolated = 0;
- goto out;
+ goto out_unlock;
}
/*
@@ -1732,9 +1733,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
- page_add_new_anon_rmap(new_page, vma, haddr);
-
+ pmdp_clear_flush(vma, haddr, pmd);
set_pmd_at(mm, haddr, pmd, entry);
+ page_add_new_anon_rmap(new_page, vma, haddr);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(page);
/*
@@ -1753,7 +1754,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-out:
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
@@ -1762,6 +1762,11 @@ out:
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+
+out_unlock:
unlock_page(page);
put_page(page);
return 0;
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7d1bca..713e462c0776 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -76,6 +76,7 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -106,6 +107,7 @@ unsigned int munlock_vma_page(struct page *page)
{
unsigned int page_mask = 0;
+ /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..8f87b14c7968 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -865,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- vma_set_policy(vma, vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
@@ -1853,7 +1853,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1862,7 +1862,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
@@ -1901,7 +1901,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1911,14 +1911,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
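
The arch_get_unmapped_area() changes above fold mmap_min_addr into the length check, the hint check and the search low limit. A standalone sketch of the hint arithmetic, ignoring the find_vma() overlap test and using made-up constants in place of TASK_SIZE and mmap_min_addr:

#include <stdio.h>

#define TASK_SIZE_      0xC0000000UL    /* illustrative, not the real value */
#define MMAP_MIN_ADDR_  0x10000UL

/* A hinted address is usable only if a mapping of @len can exist at all
 * and [addr, addr + len) stays inside [MMAP_MIN_ADDR_, TASK_SIZE_). */
static int hint_is_usable(unsigned long addr, unsigned long len)
{
        if (len > TASK_SIZE_ - MMAP_MIN_ADDR_)  /* cannot fit anywhere */
                return 0;
        return TASK_SIZE_ - len >= addr && addr >= MMAP_MIN_ADDR_;
}

int main(void)
{
        printf("%d\n", hint_is_usable(0x0, 0x1000));     /* 0: below minimum */
        printf("%d\n", hint_is_usable(0x20000, 0x1000)); /* 1 */
        printf("%d\n", hint_is_usable(TASK_SIZE_ - 0x1000, 0x2000)); /* 0: past top */
        return 0;
}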
@@ -2356,7 +2356,7 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
@@ -2735,7 +2735,7 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 1);
+ tlb_gather_mmu(&tlb, mm, 0, -1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..e9f65aaa3182 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -135,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
+ unsigned long nr_huge_updates = 0;
bool all_same_node;
pmd = pmd_offset(pud, addr);
@@ -146,6 +147,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
else if (change_huge_pmd(vma, pmd, addr, newprot,
prot_numa)) {
pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
continue;
}
/* fall through */
@@ -165,6 +167,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+ if (nr_huge_updates)
+ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
+
return pages;
}
@@ -201,6 +206,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -212,6 +218,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
+ clear_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..2201d060c31b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -175,10 +175,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
- if (extent == HPAGE_PMD_SIZE)
+ if (extent == HPAGE_PMD_SIZE) {
+ VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ anon_vma_lock_write(vma->anon_vma);
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
+ if (need_rmap_locks)
+ anon_vma_unlock_write(vma->anon_vma);
+ }
if (err > 0) {
need_flush = true;
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e451a78c9e..f104c7e9f61e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
{
- struct task_struct *start = tsk;
+ struct task_struct *tsk;
+ bool ret = false;
- do {
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- if (mempolicy_nodemask_intersects(tsk, mask))
- return true;
+ ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- if (cpuset_mems_allowed_intersects(current, tsk))
- return true;
+ ret = cpuset_mems_allowed_intersects(current, tsk);
}
- } while_each_thread(start, tsk);
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
+
+ rcu_read_lock();
- do {
+ for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
- return t;
+ goto found;
task_unlock(t);
- } while_each_thread(p, t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
- return NULL;
+ return t;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- adj -= 30;
+ points -= (points * 3) / 100;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
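
The oom_badness() hunk above swaps a fixed bonus for a proportional one: tasks with CAP_SYS_ADMIN lose 3% of their badness points, so the discount scales with how much memory the task actually uses. Illustrative arithmetic only:

#include <stdio.h>

static unsigned long discount_for_root(unsigned long points)
{
        return points - (points * 3) / 100;
}

int main(void)
{
        printf("%lu\n", discount_for_root(1000));   /* 970 */
        printf("%lu\n", discount_for_root(100000)); /* 97000 */
        return 0;
}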
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
chosen = p;
chosen_points = points;
}
- } while_each_thread(g, p);
+ }
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -394,6 +402,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
dump_tasks(memcg, nodemask);
}
+/*
+ * Number of OOM killer invocations (including memcg OOM killer).
+ * Primarily used by PM freezer to check for potential races with
+ * OOM killed frozen task.
+ */
+static atomic_t oom_kills = ATOMIC_INIT(0);
+
+int oom_kills_count(void)
+{
+ return atomic_read(&oom_kills);
+}
+
+void note_oom_kill(void)
+{
+ atomic_inc(&oom_kills);
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
@@ -406,7 +431,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t = p;
+ struct task_struct *t;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +462,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- do {
+ for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -455,13 +480,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- } while_each_thread(p, t);
+ }
read_unlock(&tasklist_lock);
- rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
- rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -487,6 +510,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
+ rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4514ad7415c3..73cbc5dc150b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -188,6 +188,26 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long nr_pages;
+
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
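
zone_dirtyable_memory() added above is free pages minus the dirty balance reserve, clamped so it cannot underflow, plus the file LRU. The same arithmetic with plain numbers instead of zone counters:

#include <stdio.h>

static unsigned long dirtyable(unsigned long nr_free,
                               unsigned long reserve,
                               unsigned long inactive_file,
                               unsigned long active_file)
{
        unsigned long nr = nr_free;

        nr -= (reserve < nr) ? reserve : nr;    /* nr -= min(nr, reserve) */
        return nr + inactive_file + active_file;
}

int main(void)
{
        /* 1000 free, 300 reserved, 400 + 200 file pages -> 1300 */
        printf("%lu\n", dirtyable(1000, 300, 400, 200));
        /* reserve larger than free: the free contribution clamps to 0 */
        printf("%lu\n", dirtyable(100, 300, 50, 10));   /* 60 */
        return 0;
}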
@@ -195,11 +215,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -235,9 +253,12 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x = global_page_state(NR_FREE_PAGES);
x -= min(x, dirty_balance_reserve);
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
+
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- /*
- * The effective global number of dirtyable pages may exclude
- * highmem as a big-picture measure to keep the ratio between
- * dirty memory and lowmem reasonable.
- *
- * But this function is purely about the individual zone and a
- * highmem zone can hold its share of dirty pages, so we don't
- * care about vm_highmem_is_dirtyable here.
- */
- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone);
-
- /* don't allow this to underflow */
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
- return nr_pages;
-}
-
-/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
@@ -1104,11 +1099,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
{
- long bw = bdi->avg_write_bandwidth;
- long t;
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
@@ -1120,7 +1115,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
- return min_t(long, t, MAX_PAUSE);
+ return min_t(unsigned long, t, MAX_PAUSE);
}
static long bdi_min_pause(struct backing_dev_info *bdi,
@@ -2031,11 +2026,12 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
+ unsigned long flags;
if (!mapping)
return 1;
- spin_lock_irq(&mapping->tree_lock);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
@@ -2044,7 +2040,7 @@ int __set_page_dirty_nobuffers(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..494a081ec5e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -360,9 +360,11 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- __SetPageTail(p);
set_page_count(p, 0);
p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
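
The smp_wmb() added to prep_compound_page() orders the first_page store before the tail flag, so a racing PageTail() reader never sees the flag set without a valid pointer behind it. A userspace analogue using C11 release/acquire in place of the kernel barrier; illustration only, build with -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int first_page_value;            /* the field being published */
static atomic_int page_is_tail;         /* the flag readers test */

static void *producer(void *arg)
{
        (void)arg;
        first_page_value = 42;                          /* set up the field */
        atomic_store_explicit(&page_is_tail, 1,
                              memory_order_release);    /* then publish it */
        return NULL;
}

static void *consumer(void *arg)
{
        (void)arg;
        while (!atomic_load_explicit(&page_is_tail, memory_order_acquire))
                ;                                       /* spin until published */
        printf("first_page_value = %d\n", first_page_value);   /* always 42 */
        return NULL;
}

int main(void)
{
        pthread_t p, c;

        pthread_create(&c, NULL, consumer, NULL);
        pthread_create(&p, NULL, producer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
}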
@@ -2118,6 +2120,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/*
+ * PM-freezer should be notified that there might be an OOM killer on
+ * its way to kill and wake somebody up. This is too early and we might
+ * end up not killing anything but false positives are acceptable.
+ * See freeze_processes.
+ */
+ note_oom_kill();
+
+ /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
@@ -2337,7 +2347,7 @@ static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2346,20 +2356,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -6142,6 +6152,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages -= 1 << order;
+#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a872a..e007236f345a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -170,6 +170,7 @@ static void free_page_cgroup(void *addr)
sizeof(struct page_cgroup) * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5da2cbcfdbb5..2beeabf502c5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (err)
break;
pgd++;
- } while (addr = next, addr != end);
+ } while (addr = next, addr < end);
return err;
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71ae4cd..51108165f829 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -108,7 +108,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
+ unsigned int cpu, tcpu;
int i;
for_each_possible_cpu(cpu) {
@@ -116,14 +116,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
+ if (!*pagep)
+ goto err;
}
}
return 0;
+
+err:
+ while (--i >= page_start)
+ __free_page(pages[pcpu_page_idx(cpu, i)]);
+
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ for (i = page_start; i < page_end; i++)
+ __free_page(pages[pcpu_page_idx(tcpu, i)]);
+ }
+ return -ENOMEM;
}
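
The new error path in pcpu_alloc_pages() frees only what was actually allocated instead of handing a half-populated range to the generic free helper. The same all-or-nothing pattern in a standalone sketch, with malloc() standing in for the per-cpu page allocation:

#include <stdio.h>
#include <stdlib.h>

/* Allocate a whole batch; on the first failure, roll back everything
 * that succeeded so the caller never sees a partially filled array. */
static int alloc_batch(void **slots, int n, size_t size)
{
        int i;

        for (i = 0; i < n; i++) {
                slots[i] = malloc(size);
                if (!slots[i])
                        goto err;
        }
        return 0;

err:
        while (--i >= 0) {              /* free what we already got */
                free(slots[i]);
                slots[i] = NULL;
        }
        return -1;
}

int main(void)
{
        void *slots[8];

        if (alloc_batch(slots, 8, 4096) == 0) {
                puts("all slots allocated");
                for (int i = 0; i < 8; i++)
                        free(slots[i]);
        }
        return 0;
}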
/**
@@ -263,6 +272,7 @@ err:
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
+ pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 8c8e08f3a692..25e2ea52db82 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -612,7 +612,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
return NULL;
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8f..4b62a16fc3c1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -86,9 +86,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
+ struct mm_struct *mm = (vma)->vm_mm;
pte_t pte;
- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- if (pte_accessible(pte))
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_accessible(mm, pte))
flush_tlb_page(vma, address);
return pte;
}
@@ -166,6 +167,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ pmd_t entry = *pmdp;
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..705bfc8e6fcd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
+ might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* above cannot corrupt).
*/
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
+ return NULL;
}
out:
rcu_read_unlock();
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto out;
+ return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
@@ -600,7 +602,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
+
ptl = &mm->page_table_lock;
goto check;
}
@@ -1386,9 +1392,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
@@ -1661,10 +1677,9 @@ void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
+ anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
-
- anon_vma_free(anon_vma);
}
#ifdef CONFIG_MIGRATION
diff --git a/mm/shmem.c b/mm/shmem.c
index 5e6a8422658b..4e4a7349c5cd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
+ wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -533,22 +534,19 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
return;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
pvec.pages, indices);
if (!pvec.nr) {
- if (index == start || unfalloc)
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
break;
+ /* But if truncating, restart to make sure all gone */
index = start;
continue;
}
- if ((index == start || unfalloc) && indices[0] >= end) {
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- break;
- }
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -560,8 +558,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (radix_tree_exceptional_entry(page)) {
if (unfalloc)
continue;
- nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ if (shmem_free_swap(mapping, index, page)) {
+ /* Swap was replaced by page: retry */
+ index--;
+ break;
+ }
+ nr_swaps_freed++;
continue;
}
@@ -570,6 +572,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
truncate_inode_page(mapping, page);
+ } else {
+ /* Page was replaced by swap: retry */
+ unlock_page(page);
+ index--;
+ break;
}
}
unlock_page(page);
@@ -826,6 +833,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private;
if (shmem_falloc &&
+ !shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped++;
@@ -1300,6 +1308,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
int ret = VM_FAULT_LOCKED;
+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_mutex in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT(shmem_fault_wait);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+ !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* It's polite to up mmap_sem if we can */
+ up_read(&vma->vm_mm->mmap_sem);
+ ret = VM_FAULT_RETRY;
+ }
+
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ spin_unlock(&inode->i_lock);
+ return ret;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1821,12 +1887,25 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+ shmem_falloc.waitq = &shmem_falloc_waitq;
+ shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
shmem_truncate_range(inode, offset, offset + len - 1);
/* No need to unmap again: hole-punching leaves COWed pages */
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&shmem_falloc_waitq);
+ spin_unlock(&inode->i_lock);
error = 0;
goto out;
}
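
The fallocate() side above publishes the range being punched through inode->i_private, and the fault path sleeps while its offset falls inside that range. A much-simplified userspace model of the handshake: the kernel parks waiters on a wait queue living on the hole-puncher's stack, which this sketch deliberately replaces with file-scope synchronization to keep lifetimes trivial. Build with -pthread:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct hole { long start, end; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t punch_done = PTHREAD_COND_INITIALIZER;
static struct hole *active_punch;       /* plays the role of i_private */

static void fault_in(long pgoff)
{
        pthread_mutex_lock(&lock);
        while (active_punch &&
               pgoff >= active_punch->start && pgoff < active_punch->end)
                pthread_cond_wait(&punch_done, &lock);  /* hole in progress */
        pthread_mutex_unlock(&lock);
        printf("fault at %ld proceeds\n", pgoff);
}

static void *punch_hole(void *arg)
{
        struct hole hole = { 16, 64 };

        (void)arg;
        pthread_mutex_lock(&lock);
        active_punch = &hole;           /* publish the in-flight range */
        pthread_mutex_unlock(&lock);

        usleep(100000);                 /* pretend to truncate the range */

        pthread_mutex_lock(&lock);
        active_punch = NULL;            /* punch finished */
        pthread_cond_broadcast(&punch_done);
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void *fault_thread(void *arg)
{
        (void)arg;
        fault_in(32);                   /* inside the hole: may wait */
        fault_in(100);                  /* outside: proceeds at once */
        return NULL;
}

int main(void)
{
        pthread_t p, f;

        pthread_create(&p, NULL, punch_hole, NULL);
        pthread_create(&f, NULL, fault_thread, NULL);
        pthread_join(p, NULL);
        pthread_join(f, NULL);
        return 0;
}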
@@ -1844,6 +1923,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ shmem_falloc.waitq = NULL;
shmem_falloc.start = start;
shmem_falloc.next = start;
shmem_falloc.nr_falloced = 0;
@@ -2048,8 +2128,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
- if (they_are_dirs)
+ if (they_are_dirs) {
+ drop_nlink(new_dentry->d_inode);
drop_nlink(old_dir);
+ }
} else if (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
@@ -2879,14 +2961,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
/* common code */
-static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
- dentry->d_name.name);
-}
-
static struct dentry_operations anon_ops = {
- .d_dname = shmem_dname
+ .d_dname = simple_dname
};
/**
diff --git a/mm/slab.c b/mm/slab.c
index 8ccd296c6d9c..bd88411595b9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q)
if (slab_state < UP)
return;
- for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) {
+ for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache_node *n;
struct kmem_cache *cache = kmalloc_caches[i];
diff --git a/mm/slab.h b/mm/slab.h
index f96b49e4704e..4d6d836247dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -162,6 +162,8 @@ static inline const char *cache_name(struct kmem_cache *s)
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
+ if (!s->memcg_params)
+ return NULL;
return s->memcg_params->memcg_caches[idx];
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2d414508e9ec..7d21d3fddbf0 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -55,6 +55,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}
+#if !defined(CONFIG_SLUB)
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there
@@ -68,6 +69,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
s = NULL;
return -EINVAL;
}
+#endif
}
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
diff --git a/mm/slub.c b/mm/slub.c
index 57707f01bcfb..deaed7b47213 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1201,8 +1201,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
/*
* Enable debugging if selected on the kernel commandline.
*/
- if (slub_debug && (!slub_debug_slabs ||
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
+ if (slub_debug && (!slub_debug_slabs || (name &&
+ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
flags |= slub_debug;
return flags;
@@ -4285,7 +4285,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
page = ACCESS_ONCE(c->partial);
if (page) {
- x = page->pobjects;
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ WARN_ON_ONCE(1);
+ else if (flags & SO_OBJECTS)
+ WARN_ON_ONCE(1);
+ else
+ x = page->pages;
total += x;
nodes[node] += x;
}
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..4e35f3ff0427 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/hugetlb.h>
#include "internal.h"
@@ -80,7 +81,7 @@ static void put_compound_page(struct page *page)
{
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head = compound_head(page);
if (likely(page != page_head &&
get_page_unless_zero(page_head))) {
@@ -94,14 +95,31 @@ static void put_compound_page(struct page *page)
* still hot on arches that do not support
* this_cpu_cmpxchg_double().
*/
- if (PageSlab(page_head)) {
- if (PageTail(page)) {
+ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
+ atomic_dec(&page->_mapcount);
if (put_page_testzero(page_head))
VM_BUG_ON(1);
-
- atomic_dec(&page->_mapcount);
- goto skip_lock_tail;
+ if (put_page_testzero(page_head))
+ __put_compound_page(page_head);
+ return;
} else
+ /*
+ * __split_huge_page_refcount
+ * run before us, "page" was a
+ * THP tail. The split
+ * page_head has been freed
+ * and reallocated as slab or
+ * hugetlbfs page of smaller
+ * order (only possible if
+ * reallocated as slab on
+ * x86).
+ */
goto skip_lock;
}
/*
@@ -115,8 +133,27 @@ static void put_compound_page(struct page *page)
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
skip_lock:
- if (put_page_testzero(page_head))
- __put_single_page(page_head);
+ if (put_page_testzero(page_head)) {
+ /*
+ * The head page may have been
+ * freed and reallocated as a
+ * compound page of smaller
+ * order and then freed again.
+ * All we know is that it
+ * cannot have become: a THP
+ * page, a compound page of
+ * higher order, a tail page.
+ * That is because we still
+ * hold the refcount of the
+ * split THP tail and
+ * page_head was the THP head
+ * before the split.
+ */
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
out_put_single:
if (put_page_testzero(page))
__put_single_page(page);
@@ -138,7 +175,6 @@ out_put_single:
VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
-skip_lock_tail:
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
__put_compound_page(page_head);
@@ -183,16 +219,30 @@ bool __get_page_tail(struct page *page)
*/
unsigned long flags;
bool got = false;
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head = compound_head(page);
if (likely(page != page_head && get_page_unless_zero(page_head))) {
-
/* Ref to put_compound_page() comment. */
- if (PageSlab(page_head)) {
+ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
__get_page_tail_foll(page, false);
return true;
} else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
put_page(page_head);
return false;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736e54b7..2d6151fc8f08 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
@@ -567,16 +568,67 @@ EXPORT_SYMBOL(truncate_pagecache);
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
- loff_t oldsize;
+ loff_t oldsize = inode->i_size;
- oldsize = inode->i_size;
i_size_write(inode, newsize);
-
+ if (newsize > oldsize)
+ pagecache_isize_extended(inode, oldsize, newsize);
truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
/**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode: inode for which i_size was extended
+ * @from: original inode size
+ * @to: new inode size
+ *
+ * Handle extension of inode size either caused by extending truncate or by
+ * write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the nearest write access to
+ * the page. This way filesystem can be sure that page_mkwrite() is called on
+ * the page before user writes to the page via mmap after the i_size has been
+ * changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_mutex - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+ int bsize = 1 << inode->i_blkbits;
+ loff_t rounded_from;
+ struct page *page;
+ pgoff_t index;
+
+ WARN_ON(to > inode->i_size);
+
+ if (from >= to || bsize == PAGE_CACHE_SIZE)
+ return;
+ /* Page straddling @from will not have any hole block created? */
+ rounded_from = round_up(from, bsize);
+ if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+ return;
+
+ index = from >> PAGE_CACHE_SHIFT;
+ page = find_lock_page(inode->i_mapping, index);
+ /* Page not cached? Nothing to do */
+ if (!page)
+ return;
+ /*
+ * See clear_page_dirty_for_io() for details why set_page_dirty()
+ * is needed.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
* @lstart: offset of beginning of hole
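
For reference, a filesystem that grows i_size in its own write path (rather than via truncate_setsize(), which now calls the helper itself) is expected to invoke pagecache_isize_extended() after publishing the new size and while still holding i_mutex. A hypothetical sketch; myfs_update_size() does not exist and only illustrates the calling convention.

/* Hypothetical caller: extend i_size after a successful buffered write. */
static void myfs_update_size(struct inode *inode, loff_t new_size)
{
	loff_t old_size = inode->i_size;	/* stable: caller holds i_mutex */

	if (new_size <= old_size)
		return;

	i_size_write(inode, new_size);
	mark_inode_dirty(inode);

	/*
	 * Write-protect the page straddling the old EOF so that
	 * ->page_mkwrite() runs before further mmap stores hit it.
	 */
	pagecache_isize_extended(inode, old_size, new_size);
}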
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..0b1725254ff1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -272,17 +272,14 @@ pid_t vm_is_stack(struct task_struct *task,
if (in_group) {
struct task_struct *t;
- rcu_read_lock();
- if (!pid_alive(task))
- goto done;
- t = task;
- do {
+ rcu_read_lock();
+ for_each_thread(task, t) {
if (vm_is_stack_for_task(t, vma)) {
ret = t->pid;
goto done;
}
- } while_each_thread(task, t);
+ }
done:
rcu_read_unlock();
}
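
The conversion above replaces the open-coded do { ... } while_each_thread() loop with for_each_thread(), which is safe under rcu_read_lock() even when the group leader has exited, so the pid_alive() check can go. A minimal sketch of the same iteration pattern; count_group_threads() is illustrative only.

/* Illustrative only: walk every thread of @task under RCU. */
static int count_group_threads(struct task_struct *task)
{
	struct task_struct *t;
	int nr = 0;

	rcu_read_lock();
	for_each_thread(task, t)
		nr++;
	rcu_read_unlock();

	return nr;
}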
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..d4565606cc96 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,12 +388,12 @@ nocache:
addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
} else {
addr = ALIGN(vstart, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
addr = ALIGN(first->va_end, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
if (list_is_last(&first->list, &vmap_area_list))
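
The three hunks above tighten the overflow check: the allocator stores addr + size in va->va_end, so that is the sum that must be tested for wrap-around, not addr + size - 1. A worked 32-bit example (numbers are illustrative):

/*
 * addr = 0xfffff000, size = 0x1000 on a 32-bit kernel:
 *   addr + size - 1 = 0xffffffff  -> old check passes
 *   addr + size     = 0x0         -> wraps, and would be stored in va->va_end
 * Checking addr + size catches the wrap before va_end is corrupted.
 */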
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..4e89500391dc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
@@ -978,7 +979,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page)) {
+ if (page_is_file_cache(page) && !PageDirty(page) &&
+ !isolated_balloon_page(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -2115,6 +2117,20 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
return aborted_reclaim;
}
+static unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+ int nr;
+
+ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+
+ return nr;
+}
+
static bool zone_reclaimable(struct zone *zone)
{
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
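
zone_reclaimable() now uses the zone-local helper above; the factor of six is the long-standing heuristic for deciding that scanning has stopped making progress. A quick worked example with illustrative numbers:

/*
 * A zone with 8000 file LRU pages and 2000 anon LRU pages (swap available)
 * has zone_reclaimable_pages() == 10000, so it stays "reclaimable" until
 * pages_scanned reaches 60000 - i.e. every reclaimable page has been
 * scanned roughly six times without freeing anything.
 */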
@@ -2270,10 +2286,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
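
pfmemalloc_watermark_ok() sums the min watermarks of the populated low zones only, and a zero reserve now means "do not throttle". A worked example with illustrative numbers:

/*
 * ZONE_DMA min watermark = 128 pages, ZONE_NORMAL min watermark = 3968
 * pages, so pfmemalloc_reserve = 4096. Direct reclaimers on this node are
 * throttled once free pages in those zones drop to 2048 (reserve / 2) or
 * below, and kswapd wakes them again as the watermarks are restored.
 */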
@@ -2298,9 +2321,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2319,10 +2342,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+	 * Check whether the pfmemalloc reserves are ok by finding the first node
+	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
+	 * GFP_KERNEL will be required for allocating network buffers when
+	 * swapping over the network, so ZONE_HIGHMEM is unusable.
+	 *
+	 * Throttling is based on the first usable node, and throttled processes
+	 * wait on a queue until kswapd makes progress and wakes them. There is
+	 * then an affinity between processes waking up and where reclaim
+	 * progress has been made, assuming each process wakes on the same node.
+	 * More importantly, processes running on remote nodes will not compete
+	 * for remote pfmemalloc reserves, and processes on different nodes
+	 * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -3043,7 +3090,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
@@ -3073,41 +3123,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
- int nr;
-
- nr = global_page_state(NR_ACTIVE_FILE) +
- global_page_state(NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += global_page_state(NR_ACTIVE_ANON) +
- global_page_state(NR_INACTIVE_ANON);
-
- return nr;
-}
-
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
- int nr;
-
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
-
- return nr;
-}
-
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e65780..6d9bace4e589 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -432,11 +433,12 @@ EXPORT_SYMBOL(dec_zone_page_state);
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*/
-void refresh_cpu_vm_stats(int cpu)
+bool refresh_cpu_vm_stats(int cpu)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ bool vm_activity = false;
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
@@ -483,14 +485,21 @@ void refresh_cpu_vm_stats(int cpu)
if (p->expire)
continue;
- if (p->pcp.count)
+ if (p->pcp.count) {
+ vm_activity = true;
drain_zone_pages(zone, &p->pcp);
+ }
#endif
}
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (global_diff[i])
+ if (global_diff[i]) {
atomic_long_add(global_diff[i], &vm_stat[i]);
+ vm_activity = true;
+ }
+
+ return vm_activity;
}
/*
@@ -779,6 +788,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
+ "numa_huge_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
@@ -1174,22 +1184,72 @@ static const struct file_operations proc_vmstat_file_operations = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static struct cpumask vmstat_off_cpus;
+static struct delayed_work vmstat_monitor_work;
-static void vmstat_update(struct work_struct *w)
+static inline bool need_vmstat(int cpu)
{
- refresh_cpu_vm_stats(smp_processor_id());
- schedule_delayed_work(&__get_cpu_var(vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ struct zone *zone;
+ int i;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (p->vm_stat_diff[i])
+ return true;
+
+ if (zone_to_nid(zone) != numa_node_id() && p->pcp.count)
+ return true;
+ }
+
+ return false;
}
-static void __cpuinit start_cpu_timer(int cpu)
+static void vmstat_update(struct work_struct *w);
+
+static void start_cpu_timer(int cpu)
{
struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DEFERRABLE_WORK(work, vmstat_update);
+ cpumask_clear_cpu(cpu, &vmstat_off_cpus);
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
+static void __cpuinit setup_cpu_timer(int cpu)
+{
+ struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+
+ INIT_DEFERRABLE_WORK(work, vmstat_update);
+ start_cpu_timer(cpu);
+}
+
+static void vmstat_update_monitor(struct work_struct *w)
+{
+ int cpu;
+
+ for_each_cpu_and(cpu, &vmstat_off_cpus, cpu_online_mask)
+ if (need_vmstat(cpu))
+ start_cpu_timer(cpu);
+
+ queue_delayed_work(system_unbound_wq, &vmstat_monitor_work,
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+
+static void vmstat_update(struct work_struct *w)
+{
+ int cpu = smp_processor_id();
+
+ if (likely(refresh_cpu_vm_stats(cpu)))
+ schedule_delayed_work(&__get_cpu_var(vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else
+ cpumask_set_cpu(cpu, &vmstat_off_cpus);
+}
+
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
* when necessary.
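
The scheme above is a self-disarming periodic work: each CPU's deferrable work re-arms itself only while refresh_cpu_vm_stats() reports activity, and the single monitor work restarts the timer for any CPU whose per-cpu counters have drifted again. A stripped-down sketch of that pattern, using hypothetical names that do not exist in the kernel:

/* Illustrative self-disarming poller; not the vmstat implementation. */
static struct delayed_work poll_work;
static bool have_work;		/* assumption: updated by the producer path */

static void poll_fn(struct work_struct *w)
{
	if (have_work)
		/* Still busy: re-arm for the next interval. */
		queue_delayed_work(system_unbound_wq, &poll_work,
				   round_jiffies_relative(HZ));
	/* Otherwise the timer stays off until poll_kick() restarts it. */
}

static void poll_kick(void)
{
	/* No-op if the work is already pending, so this is cheap to call. */
	queue_delayed_work(system_unbound_wq, &poll_work,
			   round_jiffies_relative(HZ));
}

static int __init poll_init(void)
{
	INIT_DEFERRABLE_WORK(&poll_work, poll_fn);
	poll_kick();
	return 0;
}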
@@ -1204,17 +1264,19 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- per_cpu(vmstat_work, cpu).work.func = NULL;
+ if (!cpumask_test_cpu(cpu, &vmstat_off_cpus)) {
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ per_cpu(vmstat_work, cpu).work.func = NULL;
+ }
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1237,8 +1299,14 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
+ INIT_DEFERRABLE_WORK(&vmstat_monitor_work,
+ vmstat_update_monitor);
+ queue_delayed_work(system_unbound_wq,
+ &vmstat_monitor_work,
+ round_jiffies_relative(HZ));
+
for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);