author     Mark Brown <broonie@kernel.org>    2018-05-22 10:28:38 +0100
committer  Mark Brown <broonie@kernel.org>    2018-05-22 10:28:38 +0100
commit     e1c51aa8705fd78e5259e0515aad3b5003e18550 (patch)
tree       3728628142a24460d6b488e5331f421f902e5d3a /mm
parent     6e49b7fa3083c30688b7fff1b37ed891cd1c30af (diff)
parent     08556e03ad36b2c6219aba90de88ccf58038e208 (diff)
Merge branch 'linux-linaro-lsk-v4.9' into linux-linaro-lsk-v4.9-rt (linux-linaro-lsk-v4.9-rt-test)
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c          9
-rw-r--r--  mm/frame_vector.c    12
-rw-r--r--  mm/gup.c             67
-rw-r--r--  mm/huge_memory.c      4
-rw-r--r--  mm/khugepaged.c       7
-rw-r--r--  mm/memory-failure.c   8
-rw-r--r--  mm/memory.c          40
-rw-r--r--  mm/page-writeback.c  18
-rw-r--r--  mm/percpu.c           1
-rw-r--r--  mm/shmem.c           31
-rw-r--r--  mm/slab.c             3
-rw-r--r--  mm/vmscan.c          19
-rw-r--r--  mm/vmstat.c          22
13 files changed, 185 insertions, 56 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a8d2c7a73d54..113c2248c476 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -621,7 +621,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
- error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
@@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page,
return error;
}
- error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
+ error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
if (error) {
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
@@ -1252,8 +1252,7 @@ no_page:
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
- err = add_to_page_cache_lru(page, mapping, offset,
- gfp_mask & GFP_RECLAIM_MASK);
+ err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
if (unlikely(err)) {
put_page(page);
page = NULL;
@@ -2001,7 +2000,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index db77dcb38afd..375a103d7a56 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -52,6 +52,18 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
ret = -EFAULT;
goto out;
}
+
+ /*
+ * While get_vaddr_frames() could be used for transient (kernel
+ * controlled lifetime) pinning of memory pages all current
+ * users establish long term (userspace controlled lifetime)
+ * page pinning. Treat get_vaddr_frames() like
+ * get_user_pages_longterm() and disallow it for filesystem-dax
+ * mappings.
+ */
+ if (vma_is_fsdax(vma))
+ return -EOPNOTSUPP;
+
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
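
Illustrative aside, not part of the patch: with the hunk above applied, a hypothetical driver that pins a user buffer through get_vaddr_frames() has to be prepared for -EOPNOTSUPP on filesystem-dax mappings. The helper name pin_user_buffer() below is invented for the sketch; it only uses the existing frame_vector API.

#include <linux/mm.h>
#include <linux/errno.h>

/* Illustrative only: pin @nr_frames user frames starting at @uaddr. */
static int pin_user_buffer(unsigned long uaddr, unsigned int nr_frames,
			   struct frame_vector **out)
{
	struct frame_vector *vec;
	int ret;

	vec = frame_vector_create(nr_frames);
	if (!vec)
		return -ENOMEM;

	ret = get_vaddr_frames(uaddr, nr_frames, FOLL_WRITE, vec);
	if (ret < 0) {
		/*
		 * -EOPNOTSUPP now indicates an fsdax vma that may not be
		 * pinned for a userspace-controlled lifetime.
		 */
		frame_vector_destroy(vec);
		return ret;
	}

	*out = vec;
	return ret;	/* number of frames actually mapped */
}
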
diff --git a/mm/gup.c b/mm/gup.c
index c63a0341ae38..be4ccddac26f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -430,6 +430,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
+ if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
@@ -982,6 +985,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
+{
+ struct vm_area_struct **vmas = vmas_arg;
+ struct vm_area_struct *vma_prev = NULL;
+ long rc, i;
+
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas) {
+ vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ return -ENOMEM;
+ }
+
+ rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+ for (i = 0; i < rc; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ break;
+ }
+
+ /*
+ * Either get_user_pages() failed, or the vma validation
+ * succeeded, in either case we don't need to put_page() before
+ * returning.
+ */
+ if (i >= rc)
+ goto out;
+
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+out:
+ if (vmas != vmas_arg)
+ kfree(vmas);
+ return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
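
Illustrative aside, not part of the patch: get_user_pages_longterm(), added above, is aimed at callers such as RDMA or V4L2 that keep pages pinned for a userspace-controlled lifetime. The sketch below shows one plausible caller; the function name pin_longterm_buffer() and its error handling are hypothetical, and the caller is assumed not to hold mmap_sem on entry.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Illustrative only: long-term pin @nr_pages user pages starting at @uaddr. */
static long pin_longterm_buffer(unsigned long uaddr, unsigned long nr_pages,
				struct page ***pages_out)
{
	struct page **pages;
	long pinned;

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	/* Fails with -EOPNOTSUPP if any vma in the range is fsdax backed. */
	pinned = get_user_pages_longterm(uaddr, nr_pages, FOLL_WRITE,
					 pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}

	*pages_out = pages;
	return pinned;	/* caller put_page()s each pinned page when done */
}

Passing NULL for the vmas argument lets the helper allocate its own temporary array for the fsdax check, as the hunk above shows.
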
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c234c078693c..e2982ea26090 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2279,11 +2279,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
- lock_page(page);
+ if (!trylock_page(page))
+ goto next;
/* split_huge_page() removes page from list on success */
if (!split_huge_page(page))
split++;
unlock_page(page);
+next:
put_page(page);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5d7c006373d3..898eb26f5dc8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -528,7 +528,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out;
+ }
+
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5aa71a82ca73..851efb004857 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -921,6 +921,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
+ bool mlocked = PageMlocked(hpage);
/*
* Here we are interested only in user-mapped pages, so skip any
@@ -985,6 +986,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
pfn, page_mapcount(hpage));
/*
+ * try_to_unmap() might put mlocked page in lru cache, so call
+ * shake_page() again to ensure that it's flushed.
+ */
+ if (mlocked)
+ shake_page(hpage, 0);
+
+ /*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
diff --git a/mm/memory.c b/mm/memory.c
index e2e68767a373..d2db2c4eb0a4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2848,6 +2848,17 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
return ret;
}
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+ return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
static int pte_alloc_one_map(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
@@ -2871,18 +2882,27 @@ static int pte_alloc_one_map(struct fault_env *fe)
map_pte:
/*
* If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
- * didn't become pmd_trans_huge under us and then back to pmd_none, as
- * a result of MADV_DONTNEED running immediately after a huge pmd fault
- * in a different thread of this mm, in turn leading to a misleading
- * pmd_trans_huge() retval. All we have to ensure is that it is a
- * regular pmd that we can walk with pte_offset_map() and we can do that
- * through an atomic read in C, which is what pmd_trans_unstable()
- * provides.
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
+ * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
+ * under us and then back to pmd_none, as a result of MADV_DONTNEED
+ * running immediately after a huge pmd fault in a different thread of
+ * this mm, in turn leading to a misleading pmd_trans_huge() retval.
+ * All we have to ensure is that it is a regular pmd that we can walk
+ * with pte_offset_map() and we can do that through an atomic read in
+ * C, which is what pmd_trans_unstable() provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_devmap_trans_unstable(fe->pmd))
return VM_FAULT_NOPAGE;
+ /*
+ * At this point we know that our vmf->pmd points to a page of ptes
+ * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
+ * for the duration of the fault. If a racing MADV_DONTNEED runs and
+ * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
+ * be valid and we will re-check to make sure the vmf->pte isn't
+ * pte_none() under vmf->ptl protection when we return to
+ * alloc_set_pte().
+ */
fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
&fe->ptl);
return 0;
@@ -3456,7 +3476,7 @@ static int handle_pte_fault(struct fault_env *fe)
fe->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_devmap_trans_unstable(fe->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903..807236aed275 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2506,13 +2506,13 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2618,15 +2618,15 @@ void cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
lock_page_memcg(page);
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, wb);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
unlock_page_memcg(page);
} else {
ClearPageDirty(page);
@@ -2658,7 +2658,7 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
/*
* Yes, Virginia, this is indeed insane.
@@ -2695,7 +2695,7 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_node_page_state(page, NR_FILE_DIRTY);
@@ -2703,7 +2703,7 @@ int clear_page_dirty_for_io(struct page *page)
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
return TestClearPageDirty(page);
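
Illustrative aside, not part of the patch: after this change every unlocked_inode_to_wb_begin()/unlocked_inode_to_wb_end() pair carries a struct wb_lock_cookie on the caller's stack instead of a bool, as in this minimal, hypothetical stat-update helper.

#include <linux/backing-dev.h>

/* Illustrative only: the begin/cookie/end pattern used throughout this file. */
static void wb_stat_update_example(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct bdi_writeback *wb;
	struct wb_lock_cookie cookie = {};

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	/* ... adjust per-wb counters here, e.g. dec_wb_stat(wb, WB_DIRTIED) ... */
	unlocked_inode_to_wb_end(inode, &cookie);
}
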
diff --git a/mm/percpu.c b/mm/percpu.c
index 4e739fcf91bf..88b57ee35ba3 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -70,6 +70,7 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
+#include <linux/sched.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index 2123bfc39ef2..42ca5df2c0e3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -466,36 +466,45 @@ next:
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
- if (nr_to_split && split >= nr_to_split) {
- iput(inode);
- continue;
- }
+ if (nr_to_split && split >= nr_to_split)
+ goto leave;
- page = find_lock_page(inode->i_mapping,
+ page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
if (!page)
goto drop;
+ /* No huge page at the end of the file: nothing to split */
if (!PageTransHuge(page)) {
- unlock_page(page);
put_page(page);
goto drop;
}
+ /*
+ * Leave the inode on the list if we failed to lock
+ * the page at this time.
+ *
+ * Waiting for the lock may lead to deadlock in the
+ * reclaim path.
+ */
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto leave;
+ }
+
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
- if (ret) {
- /* split failed: leave it on the list */
- iput(inode);
- continue;
- }
+ /* If split failed leave the inode on the list */
+ if (ret)
+ goto leave;
split++;
drop:
list_del_init(&info->shrinklist);
removed++;
+leave:
iput(inode);
}
diff --git a/mm/slab.c b/mm/slab.c
index 1f82d16a0518..c59844dbd034 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4096,7 +4096,8 @@ next:
next_reap_node();
out:
/* Set up the next iteration */
- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
+ schedule_delayed_work_on(smp_processor_id(), work,
+ round_jiffies_relative(REAPTIMEOUT_AC));
}
#ifdef CONFIG_SLABINFO
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4365de74bdb0..557ad1367595 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2841,8 +2841,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!managed_zone(zone) ||
- pgdat_reclaimable_pages(pgdat) == 0)
+ if (!managed_zone(zone))
+ continue;
+
+ if (!zone_reclaimable_pages(zone))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2964,7 +2966,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = memalloc_noio_flags(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -2979,12 +2981,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
- if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
- gfp_mask,
+ sc.gfp_mask,
sc.reclaim_idx);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3747,16 +3749,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = memalloc_noio_flags(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = classzone_idx,
+ .reclaim_idx = gfp_zone(gfp_mask),
};
cond_resched();
@@ -3766,7 +3767,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* and RECLAIM_UNMAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
- lockdep_set_current_reclaim_state(gfp_mask);
+ lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3585f345c22..00f9620e21ff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1363,8 +1363,6 @@ static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
return zone == compare;
}
- /* The zone must be somewhere! */
- WARN_ON_ONCE(1);
return false;
}
@@ -1399,18 +1397,24 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone->present_pages,
zone->managed_pages);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
-
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ seq_putc(m, ')');
+
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;