aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/compaction.c24
-rw-r--r--mm/fremap.c8
-rw-r--r--mm/huge_memory.c102
-rw-r--r--mm/hugetlb.c287
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memcontrol.c42
-rw-r--r--mm/memory-failure.c69
-rw-r--r--mm/memory.c87
-rw-r--r--mm/mempolicy.c8
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c72
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slub.c12
-rw-r--r--mm/swap.c74
-rw-r--r--mm/vmalloc.c6
-rw-r--r--mm/vmscan.c53
-rw-r--r--mm/vmstat.c96
27 files changed, 755 insertions, 269 deletions
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7..5a7d58fb883 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
struct bio_vec *to, *from;
unsigned i;
+ if (force)
+ goto bounce;
bio_for_each_segment(from, *bio_orig, i)
if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
goto bounce;
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bd..18a90b4d0bf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
bool migrate_scanner)
{
struct zone *zone = cc->zone;
+
+ if (cc->ignore_skip_hint)
+ return;
+
if (!page)
return;
@@ -248,7 +252,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
- unsigned long nr_strict_required = end_pfn - blockpfn;
unsigned long flags;
bool locked = false;
@@ -261,11 +264,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
nr_scanned++;
if (!pfn_valid_within(blockpfn))
- continue;
+ goto isolate_fail;
+
if (!valid_page)
valid_page = page;
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/*
* The zone lock must be held to isolate freepages.
@@ -286,12 +290,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
- if (!isolated && strict)
- break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -302,7 +304,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (isolated) {
blockpfn += isolated - 1;
cursor += isolated - 1;
+ continue;
}
+
+isolate_fail:
+ if (strict)
+ break;
+ else
+ continue;
+
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -312,7 +322,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
- if (strict && nr_strict_required > total_isolated)
+ if (strict && blockpfn < end_pfn)
total_isolated = 0;
if (locked)
diff --git a/mm/fremap.c b/mm/fremap.c
index 87da3590c61..1fb6bfe39d8 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -203,9 +203,10 @@ get_write_lock:
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
struct file *file = get_file(vma->vm_file);
+ /* mmap_region may free vma; grab the info now */
+ vm_flags = vma->vm_flags;
- addr = mmap_region(file, start, size,
- vma->vm_flags, pgoff);
+ addr = mmap_region(file, start, size, vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
@@ -213,7 +214,7 @@ get_write_lock:
BUG_ON(addr != start);
err = 0;
}
- goto out;
+ goto out_freed;
}
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
@@ -248,6 +249,7 @@ get_write_lock:
out:
if (vma)
vm_flags = vma->vm_flags;
+out_freed:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83f..eb00e81601a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,7 +1166,7 @@ alloc:
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- if (is_huge_zero_pmd(orig_pmd)) {
+ if (!page) {
ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
address, pmd, orig_pmd, haddr);
} else {
@@ -1190,7 +1190,7 @@ alloc:
goto out;
}
- if (is_huge_zero_pmd(orig_pmd))
+ if (!page)
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1215,7 +1215,7 @@ alloc:
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (is_huge_zero_pmd(orig_pmd)) {
+ if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
@@ -1288,64 +1288,104 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int page_nid = -1, this_nid = numa_node_id();
int target_nid;
- int current_nid = -1;
- bool migrated;
+ bool page_locked;
+ bool migrated = false;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- get_page(page);
- current_nid = page_to_nid(page);
+ page_nid = page_to_nid(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == this_nid)
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- put_page(page);
- goto clear_pmdnuma;
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+
+ /*
+ * Otherwise wait for potential migrations and retry. We do
+ * relock and check_same as the page may no longer be mapped.
+ * As the fault is being retried, do not account for it.
+ */
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
}
- /* Acquire the page lock to serialise THP migrations */
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
- lock_page(page);
+ if (!page_locked)
+ lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PTE did not while locked */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ page_nid = -1;
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
- /* Migrate the THP to the requested node */
+ /* Bail if we fail to protect against THP splits for any reason */
+ if (unlikely(!anon_vma)) {
+ put_page(page);
+ page_nid = -1;
+ goto clear_pmdnuma;
+ }
+
+ /*
+ * The page_table_lock above provides a memory barrier
+ * with change_protection_range.
+ */
+ if (mm_tlb_flush_pending(mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (!migrated)
- goto check_same;
-
- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
- return 0;
+ if (migrated)
+ page_nid = target_nid;
-check_same:
- spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ goto out;
clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
out_unlock:
spin_unlock(&mm->page_table_lock);
- if (current_nid != -1)
- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
return 0;
}
@@ -2286,6 +2326,8 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2697,6 +2739,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2762,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
split_huge_page(page);
put_page(page);
- BUG_ON(pmd_trans_huge(*pmd));
+
+ /*
+ * We don't always have down_write of mmap_sem here: a racing
+ * do_huge_pmd_wp_page() might have copied-on-write to another
+ * huge page before our split_huge_page() got the anon_vma lock.
+ */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ goto again;
}
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf73a55..96cfebd0d67 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/page-isolation.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -517,9 +518,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (list_empty(&h->hugepage_freelists[nid]))
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+ if (!is_migrate_isolate_page(page))
+ break;
+ /*
+ * if 'non-isolated free hugepage' not found on the list,
+ * the allocation fails.
+ */
+ if (&h->hugepage_freelists[nid] == &page->lru)
return NULL;
- page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
@@ -690,6 +697,40 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+/*
+ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
+ * normal or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+ compound_page_dtor *dtor;
+
+ if (!PageHead(page_head))
+ return 0;
+
+ dtor = get_compound_page_dtor(page_head);
+
+ return dtor == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHeadHuge);
+
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -2295,16 +2336,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte)
- goto nomem;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
@@ -2324,10 +2375,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_unlock(&src->page_table_lock);
spin_unlock(&dst->page_table_lock);
}
- return 0;
-nomem:
- return -ENOMEM;
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+ return ret;
}
static int is_hugetlb_entry_migration(pte_t pte)
@@ -2473,7 +2525,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2931,15 +2983,6 @@ out_mutex:
return ret;
}
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -3169,6 +3212,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&mm->page_table_lock);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
+ }
+ return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c440d..784d1e4bc38 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
- struct page *head = compound_trans_head(page);
+ struct page *head = compound_head(page);
/*
* head may actually be splitted and freed from under
* us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf..f45e21ab9ce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -379,7 +379,7 @@ struct mem_cgroup {
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
@@ -1220,7 +1220,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (dead_count == iter->last_dead_count) {
smp_rmb();
last_visited = iter->last_visited;
- if (last_visited &&
+ if (last_visited && last_visited != root &&
!css_tryget(&last_visited->css))
last_visited = NULL;
}
@@ -1229,7 +1229,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- if (last_visited)
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
iter->last_visited = memcg;
@@ -3186,11 +3186,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
if (!s->memcg_params)
return -ENOMEM;
- INIT_WORK(&s->memcg_params->destroy,
- kmem_cache_destroy_work_func);
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
+ INIT_WORK(&s->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
} else
s->memcg_params->is_root_cache = true;
@@ -5584,7 +5584,13 @@ static int compare_thresholds(const void *a, const void *b)
const struct mem_cgroup_threshold *_a = a;
const struct mem_cgroup_threshold *_b = b;
- return _a->threshold - _b->threshold;
+ if (_a->threshold > _b->threshold)
+ return 1;
+
+ if (_a->threshold < _b->threshold)
+ return -1;
+
+ return 0;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -6296,16 +6302,6 @@ mem_cgroup_css_online(struct cgroup *cont)
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- if (error) {
- /*
- * We call put now because our (and parent's) refcnts
- * are already in place. mem_cgroup_put() will internally
- * call __mem_cgroup_free, so return directly
- */
- mem_cgroup_put(memcg);
- if (parent->use_hierarchy)
- mem_cgroup_put(parent);
- }
return error;
}
@@ -6330,9 +6326,23 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct cgroup *iter;
mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ rcu_read_lock();
+ cgroup_for_each_descendant_post(iter, cont) {
+ rcu_read_unlock();
+ mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
mem_cgroup_reparent_charges(memcg);
+
mem_cgroup_destroy_all_caches(memcg);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932..59c62fa75c5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -854,14 +854,14 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = compound_head(p);
+ struct page *hpage = *hpagep;
struct page *ppage;
if (PageReserved(p) || PageSlab(p))
@@ -936,6 +936,21 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
}
@@ -952,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(ppage, &tokill);
- if (hpage != ppage)
- lock_page(ppage);
-
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
- if (hpage != ppage)
- unlock_page(ppage);
-
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1179,8 +1188,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
@@ -1410,7 +1423,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/*
* Isolate the page, so that it doesn't get reallocated if it
- * was free.
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
*/
set_migratetype_isolate(p, true);
/*
@@ -1433,7 +1447,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p, MIGRATE_MOVABLE);
unlock_memory_hotplug();
return ret;
}
@@ -1489,12 +1502,17 @@ static int soft_offline_huge_page(struct page *page, int flags)
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
} else {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_trans_order(hpage),
- &num_poisoned_pages);
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
}
- /* keep elevated page count for bad page */
return ret;
}
@@ -1526,7 +1544,7 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_trans_head(page);
+ struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1559,7 +1577,7 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
- /* keep elevated page count for bad page */
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
@@ -1625,7 +1643,22 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in pagevec and actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page should be freed back to buddy before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
atomic_long_inc(&num_poisoned_pages);
}
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 61a262b08e5..4b60011907d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -211,14 +211,15 @@ static int tlb_next_batch(struct mmu_gather *tlb)
* tear-down from @mm. The @fullmm argument is used when @mm is without
* users and we're going to destroy the full address space (exit/execve).
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
tlb->mm = mm;
- tlb->fullmm = fullmm;
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
tlb->need_flush_all = 0;
- tlb->start = -1UL;
- tlb->end = 0;
+ tlb->start = start;
+ tlb->end = end;
tlb->need_flush = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
@@ -258,8 +259,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
- tlb->start = start;
- tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -1203,13 +1202,23 @@ again:
* and page-free while holding it.
*/
if (force_flush) {
+ unsigned long old_end;
+
force_flush = 0;
-#ifdef HAVE_GENERIC_MMU_GATHER
- tlb->start = addr;
- tlb->end = end;
-#endif
+ /*
+ * Flush the TLB just for the previous segment,
+ * then update the range to be the remaining
+ * TLB range.
+ */
+ old_end = tlb->end;
+ tlb->end = addr;
+
tlb_flush_mmu(tlb);
+
+ tlb->start = addr;
+ tlb->end = old_end;
+
if (addr != end)
goto again;
}
@@ -1396,7 +1405,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
@@ -1422,7 +1431,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, address, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, address, end);
unmap_single_vma(&tlb, vma, address, end, details);
@@ -3516,12 +3525,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+ unsigned long addr, int page_nid)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
return mpol_misplaced(page, vma, addr);
@@ -3532,7 +3541,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
- int current_nid = -1;
+ int page_nid = -1;
int target_nid;
bool migrated = false;
@@ -3562,15 +3571,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- current_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
- /*
- * Account for the fault against the current node if it not
- * being replaced regardless of where the page is located.
- */
- current_nid = numa_node_id();
put_page(page);
goto out;
}
@@ -3578,11 +3582,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, target_nid);
if (migrated)
- current_nid = target_nid;
+ page_nid = target_nid;
out:
- if (current_nid != -1)
- task_numa_fault(current_nid, 1, migrated);
+ if (page_nid != -1)
+ task_numa_fault(page_nid, 1, migrated);
return 0;
}
@@ -3597,7 +3601,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long offset;
spinlock_t *ptl;
bool numa = false;
- int local_nid = numa_node_id();
spin_lock(&mm->page_table_lock);
pmd = *pmdp;
@@ -3620,9 +3623,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
- int curr_nid = local_nid;
+ int page_nid = -1;
int target_nid;
- bool migrated;
+ bool migrated = false;
+
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
@@ -3644,25 +3648,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(page_mapcount(page) != 1))
continue;
- /*
- * Note that the NUMA fault is later accounted to either
- * the node that is currently running or where the page is
- * migrated to.
- */
- curr_nid = local_nid;
- target_nid = numa_migrate_prep(page, vma, addr,
- page_to_nid(page));
- if (target_nid == -1) {
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ pte_unmap_unlock(pte, ptl);
+ if (target_nid != -1) {
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ page_nid = target_nid;
+ } else {
put_page(page);
- continue;
}
- /* Migrate to the requested node */
- pte_unmap_unlock(pte, ptl);
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- curr_nid = target_nid;
- task_numa_fault(curr_nid, 1, migrated);
+ if (page_nid != -1)
+ task_numa_fault(page_nid, 1, migrated);
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
@@ -4065,6 +4063,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return len;
}
+EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 74310017296..6c2dace665a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -732,7 +732,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (prev) {
vma = prev;
next = vma->vm_next;
- continue;
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+ /* vma_merge() joined vma && vma->next, case 8 */
+ goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
@@ -744,6 +747,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
+ replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
@@ -2797,7 +2801,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
*/
VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
- if (!pol || pol == &default_policy)
+ if (!pol || pol == &default_policy || (pol->flags & MPOL_F_MORON))
mode = MPOL_DEFAULT;
else
mode = pol->mode;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c24438bb..a88c12f2235 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -103,7 +103,7 @@ void putback_movable_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(balloon_page_movable(page)))
+ if (unlikely(isolated_balloon_page(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
@@ -1710,12 +1710,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unlock_page(new_page);
put_page(new_page); /* Free it */
- unlock_page(page);
+ /* Retake the callers reference and putback on LRU */
+ get_page(page);
putback_lru_page(page);
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- isolated = 0;
- goto out;
+ goto out_unlock;
}
/*
@@ -1732,9 +1733,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
- page_add_new_anon_rmap(new_page, vma, haddr);
-
+ pmdp_clear_flush(vma, haddr, pmd);
set_pmd_at(mm, haddr, pmd, entry);
+ page_add_new_anon_rmap(new_page, vma, haddr);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(page);
/*
@@ -1753,7 +1754,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-out:
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
@@ -1762,6 +1762,11 @@ out:
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+
+out_unlock:
unlock_page(page);
put_page(page);
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fa..8f87b14c796 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -865,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- vma_set_policy(vma, vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
@@ -1853,7 +1853,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1862,7 +1862,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
@@ -1901,7 +1901,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1911,14 +1911,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -2356,7 +2356,7 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
@@ -2735,7 +2735,7 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 1);
+ tlb_gather_mmu(&tlb, mm, 0, -1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b4..e9f65aaa318 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -135,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
+ unsigned long nr_huge_updates = 0;
bool all_same_node;
pmd = pmd_offset(pud, addr);
@@ -146,6 +147,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
else if (change_huge_pmd(vma, pmd, addr, newprot,
prot_numa)) {
pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
continue;
}
/* fall through */
@@ -165,6 +167,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+ if (nr_huge_updates)
+ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
+
return pages;
}
@@ -201,6 +206,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -212,6 +218,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
+ clear_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e451a78c9..dfa94ed3c7f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -170,7 +170,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- adj -= 30;
+ points -= (points * 3) / 100;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4514ad7415c..73cbc5dc150 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -188,6 +188,26 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long nr_pages;
+
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -195,11 +215,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -235,9 +253,12 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x = global_page_state(NR_FREE_PAGES);
x -= min(x, dirty_balance_reserve);
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
+
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- /*
- * The effective global number of dirtyable pages may exclude
- * highmem as a big-picture measure to keep the ratio between
- * dirty memory and lowmem reasonable.
- *
- * But this function is purely about the individual zone and a
- * highmem zone can hold its share of dirty pages, so we don't
- * care about vm_highmem_is_dirtyable here.
- */
- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone);
-
- /* don't allow this to underflow */
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
- return nr_pages;
-}
-
-/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
@@ -1104,11 +1099,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
{
- long bw = bdi->avg_write_bandwidth;
- long t;
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
@@ -1120,7 +1115,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
- return min_t(long, t, MAX_PAUSE);
+ return min_t(unsigned long, t, MAX_PAUSE);
}
static long bdi_min_pause(struct backing_dev_info *bdi,
@@ -2031,11 +2026,12 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
+ unsigned long flags;
if (!mapping)
return 1;
- spin_lock_irq(&mapping->tree_lock);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
@@ -2044,7 +2040,7 @@ int __set_page_dirty_nobuffers(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fcc..0ab02fb8e9b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -360,9 +360,11 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- __SetPageTail(p);
set_page_count(p, 0);
p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
@@ -6142,6 +6144,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages -= 1 << order;
+#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5da2cbcfdbb..2beeabf502c 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (err)
break;
pgd++;
- } while (addr = next, addr != end);
+ } while (addr = next, addr < end);
return err;
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8..4b62a16fc3c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -86,9 +86,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
+ struct mm_struct *mm = (vma)->vm_mm;
pte_t pte;
- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- if (pte_accessible(pte))
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_accessible(mm, pte))
flush_tlb_page(vma, address);
return pte;
}
@@ -166,6 +167,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ pmd_t entry = *pmdp;
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d..3f6077461ae 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
+
ptl = &mm->page_table_lock;
goto check;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 5e6a8422658..509b393ecee 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2879,14 +2879,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
/* common code */
-static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
- dentry->d_name.name);
-}
-
static struct dentry_operations anon_ops = {
- .d_dname = shmem_dname
+ .d_dname = simple_dname
};
/**
diff --git a/mm/slab.c b/mm/slab.c
index 8ccd296c6d9..bd88411595b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q)
if (slab_state < UP)
return;
- for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) {
+ for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache_node *n;
struct kmem_cache *cache = kmalloc_caches[i];
diff --git a/mm/slab.h b/mm/slab.h
index f96b49e4704..4d6d836247d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -162,6 +162,8 @@ static inline const char *cache_name(struct kmem_cache *s)
static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
+ if (!s->memcg_params)
+ return NULL;
return s->memcg_params->memcg_caches[idx];
}
diff --git a/mm/slub.c b/mm/slub.c
index 57707f01bcf..deaed7b4721 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1201,8 +1201,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
/*
* Enable debugging if selected on the kernel commandline.
*/
- if (slub_debug && (!slub_debug_slabs ||
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
+ if (slub_debug && (!slub_debug_slabs || (name &&
+ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
flags |= slub_debug;
return flags;
@@ -4285,7 +4285,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
page = ACCESS_ONCE(c->partial);
if (page) {
- x = page->pobjects;
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ WARN_ON_ONCE(1);
+ else if (flags & SO_OBJECTS)
+ WARN_ON_ONCE(1);
+ else
+ x = page->pages;
total += x;
nodes[node] += x;
}
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d684..4e35f3ff042 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/hugetlb.h>
#include "internal.h"
@@ -80,7 +81,7 @@ static void put_compound_page(struct page *page)
{
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head = compound_head(page);
if (likely(page != page_head &&
get_page_unless_zero(page_head))) {
@@ -94,14 +95,31 @@ static void put_compound_page(struct page *page)
* still hot on arches that do not support
* this_cpu_cmpxchg_double().
*/
- if (PageSlab(page_head)) {
- if (PageTail(page)) {
+ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
+ atomic_dec(&page->_mapcount);
if (put_page_testzero(page_head))
VM_BUG_ON(1);
-
- atomic_dec(&page->_mapcount);
- goto skip_lock_tail;
+ if (put_page_testzero(page_head))
+ __put_compound_page(page_head);
+ return;
} else
+ /*
+ * __split_huge_page_refcount
+ * run before us, "page" was a
+ * THP tail. The split
+ * page_head has been freed
+ * and reallocated as slab or
+ * hugetlbfs page of smaller
+ * order (only possible if
+ * reallocated as slab on
+ * x86).
+ */
goto skip_lock;
}
/*
@@ -115,8 +133,27 @@ static void put_compound_page(struct page *page)
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
skip_lock:
- if (put_page_testzero(page_head))
- __put_single_page(page_head);
+ if (put_page_testzero(page_head)) {
+ /*
+ * The head page may have been
+ * freed and reallocated as a
+ * compound page of smaller
+ * order and then freed again.
+ * All we know is that it
+ * cannot have become: a THP
+ * page, a compound page of
+ * higher order, a tail page.
+ * That is because we still
+ * hold the refcount of the
+ * split THP tail and
+ * page_head was the THP head
+ * before the split.
+ */
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
out_put_single:
if (put_page_testzero(page))
__put_single_page(page);
@@ -138,7 +175,6 @@ out_put_single:
VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
-skip_lock_tail:
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
__put_compound_page(page_head);
@@ -183,16 +219,30 @@ bool __get_page_tail(struct page *page)
*/
unsigned long flags;
bool got = false;
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head = compound_head(page);
if (likely(page != page_head && get_page_unless_zero(page_head))) {
-
/* Ref to put_compound_page() comment. */
- if (PageSlab(page_head)) {
+ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
__get_page_tail_foll(page, false);
return true;
} else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
put_page(page_head);
return false;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb0..d4565606cc9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,12 +388,12 @@ nocache:
addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
} else {
addr = ALIGN(vstart, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
addr = ALIGN(first->va_end, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
if (list_is_last(&first->list, &vmap_area_list))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee..43ddef3cf44 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
@@ -978,7 +979,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page)) {
+ if (page_is_file_cache(page) && !PageDirty(page) &&
+ !isolated_balloon_page(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -2115,6 +2117,20 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
return aborted_reclaim;
}
+static unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+ int nr;
+
+ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+
+ return nr;
+}
+
static bool zone_reclaimable(struct zone *zone)
{
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
@@ -3073,41 +3089,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
- int nr;
-
- nr = global_page_state(NR_ACTIVE_FILE) +
- global_page_state(NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += global_page_state(NR_ACTIVE_ANON) +
- global_page_state(NR_INACTIVE_ANON);
-
- return nr;
-}
-
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
- int nr;
-
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
-
- return nr;
-}
-
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e6578..6d9bace4e58 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -432,11 +433,12 @@ EXPORT_SYMBOL(dec_zone_page_state);
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*/
-void refresh_cpu_vm_stats(int cpu)
+bool refresh_cpu_vm_stats(int cpu)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ bool vm_activity = false;
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
@@ -483,14 +485,21 @@ void refresh_cpu_vm_stats(int cpu)
if (p->expire)
continue;
- if (p->pcp.count)
+ if (p->pcp.count) {
+ vm_activity = true;
drain_zone_pages(zone, &p->pcp);
+ }
#endif
}
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (global_diff[i])
+ if (global_diff[i]) {
atomic_long_add(global_diff[i], &vm_stat[i]);
+ vm_activity = true;
+ }
+
+ return vm_activity;
+
}
/*
@@ -779,6 +788,7 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
+ "numa_huge_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
@@ -1174,22 +1184,72 @@ static const struct file_operations proc_vmstat_file_operations = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static struct cpumask vmstat_off_cpus;
+struct delayed_work vmstat_monitor_work;
-static void vmstat_update(struct work_struct *w)
+static inline bool need_vmstat(int cpu)
{
- refresh_cpu_vm_stats(smp_processor_id());
- schedule_delayed_work(&__get_cpu_var(vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ struct zone *zone;
+ int i;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (p->vm_stat_diff[i])
+ return true;
+
+ if (zone_to_nid(zone) != numa_node_id() && p->pcp.count)
+ return true;
+ }
+
+ return false;
}
-static void __cpuinit start_cpu_timer(int cpu)
+static void vmstat_update(struct work_struct *w);
+
+static void start_cpu_timer(int cpu)
{
struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DEFERRABLE_WORK(work, vmstat_update);
+ cpumask_clear_cpu(cpu, &vmstat_off_cpus);
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
+static void __cpuinit setup_cpu_timer(int cpu)
+{
+ struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+
+ INIT_DEFERRABLE_WORK(work, vmstat_update);
+ start_cpu_timer(cpu);
+}
+
+static void vmstat_update_monitor(struct work_struct *w)
+{
+ int cpu;
+
+ for_each_cpu_and(cpu, &vmstat_off_cpus, cpu_online_mask)
+ if (need_vmstat(cpu))
+ start_cpu_timer(cpu);
+
+ queue_delayed_work(system_unbound_wq, &vmstat_monitor_work,
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+
+static void vmstat_update(struct work_struct *w)
+{
+ int cpu = smp_processor_id();
+
+ if (likely(refresh_cpu_vm_stats(cpu)))
+ schedule_delayed_work(&__get_cpu_var(vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else
+ cpumask_set_cpu(cpu, &vmstat_off_cpus);
+}
+
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
@@ -1204,17 +1264,19 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- per_cpu(vmstat_work, cpu).work.func = NULL;
+ if (!cpumask_test_cpu(cpu, &vmstat_off_cpus)) {
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ per_cpu(vmstat_work, cpu).work.func = NULL;
+ }
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1237,8 +1299,14 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
+ INIT_DEFERRABLE_WORK(&vmstat_monitor_work,
+ vmstat_update_monitor);
+ queue_delayed_work(system_unbound_wq,
+ &vmstat_monitor_work,
+ round_jiffies_relative(HZ));
+
for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);