diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 249 | ||||
-rw-r--r-- | mm/filemap.c | 341 | ||||
-rw-r--r-- | mm/internal.h | 22 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 4 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 56 | ||||
-rw-r--r-- | mm/mincore.c | 20 | ||||
-rw-r--r-- | mm/page_alloc.c | 62 | ||||
-rw-r--r-- | mm/readahead.c | 27 | ||||
-rw-r--r-- | mm/shmem.c | 122 | ||||
-rw-r--r-- | mm/swap.c | 51 | ||||
-rw-r--r-- | mm/truncate.c | 74 | ||||
-rw-r--r-- | mm/vmscan.c | 36 |
15 files changed, 697 insertions, 375 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 5e38e5706f62..4229fc22a477 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. + */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) { - return compact_checklock_irqsave(lock, flags, false, cc); + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; } /* Returns true if the page is within a block suitable for migration to */ @@ -329,7 +361,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool skipped_async_unsuitable = false; - const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + bool set_unsuitable = true; + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? ISOLATE_UNEVICTABLE : 0); /* @@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (compact_should_abort(cc)) + return 0; + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { @@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * the minimum amount of work satisfies the allocation */ mt = get_pageblock_migratetype(page); - if (!cc->sync && !migrate_async_suitable(mt)) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { + set_unsuitable = false; goto next_pageblock; } } @@ -646,11 +682,10 @@ next_pageblock: /* * Update the pageblock-skip information and cached scanner pfn, * if the whole pageblock was scanned without isolating any page. - * This is not done when pageblock was skipped due to being unsuitable - * for async compaction, so that eventual sync compaction can try. */ - if (low_pfn == end_pfn && !skipped_async_unsuitable) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone, * Initialise the free scanner. The starting point is where we last * successfully isolated from, zone-cached value, or the end of the * zone when isolating for the first time. We need this aligned to - * the pageblock boundary, because we do pfn -= pageblock_nr_pages - * in the for loop. + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. */ - pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; - unsigned long end_pfn; /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; + cc->free_pfn = block_start_pfn; + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); + nr_freepages += isolated; /* - * Take care when isolating in last pageblock of a zone which - * ends in the middle of a pageblock. + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. */ - end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); - nr_freepages += isolated; + if (isolated) + cc->finished_update_free = true; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * isolate_freepages_block() might have aborted due to async + * compaction being contended */ - if (isolated) { - cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); - } + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + cc->free_pfn = cc->migrate_pfn; + cc->nr_freepages = nr_freepages; } @@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. + */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. */ -static void update_nr_listpages(struct compact_control *cc) +static void compaction_free(struct page *page, unsigned long data) { - int nr_migratepages = 0; - int nr_freepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; + struct compact_control *cc = (struct compact_control *)data; - cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; } /* possible outcome of isolate_migratepages */ @@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { /* Let the next compaction start anew. */ - zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* @@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); @@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it @@ -1060,9 +1091,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0)) compaction_defer_reset(zone, cc->order, false); - /* Currently async compaction is never deferred. */ - else if (cc->sync) - defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); @@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1192,7 +1219,7 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; diff --git a/mm/filemap.c b/mm/filemap.c index 4cbc4cd512e4..bdaa21555abe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -448,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page) +{ + void **slot; + int error; + + slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); + if (slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + return 0; + } + error = radix_tree_insert(&mapping->page_tree, page->index, page); + if (!error) + mapping->nrpages++; + return error; +} + /** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add @@ -482,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); trace_mm_filemap_add_to_page_cache(page); @@ -688,14 +710,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -736,24 +845,50 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_get_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} +EXPORT_SYMBOL(find_get_page); + +/** + * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. + */ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -766,6 +901,29 @@ repeat: } return page; } +EXPORT_SYMBOL(find_lock_entry); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * @mapping: the address_space to search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * Otherwise, %NULL is returned. + * + * find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_lock_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} EXPORT_SYMBOL(find_lock_page); /** @@ -774,16 +932,18 @@ EXPORT_SYMBOL(find_lock_page); * @index: the page's index into the mapping * @gfp_mask: page allocation mode * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the page is not present, a new page is allocated using @gfp_mask + * and added to the page cache and the VM's LRU list. The page is + * returned locked and with an increased refcount. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * On memory exhaustion, %NULL is returned. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * find_or_create_page() may sleep, even if @gfp_flags specifies an + * atomic allocation! */ struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) @@ -816,6 +976,76 @@ repeat: EXPORT_SYMBOL(find_or_create_page); /** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages are included in the returned + * array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} + +/** * find_get_pages - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index @@ -1797,6 +2027,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1823,6 +2065,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1859,6 +2103,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1866,40 +2114,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1918,31 +2151,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -1976,7 +2188,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!pagefault_disabled()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; @@ -2350,9 +2561,7 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); diff --git a/mm/internal.h b/mm/internal.h index 3e910000fda4..1a8a0d4b687a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include <linux/fs.h> #include <linux/mm.h> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. @@ -119,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated @@ -129,7 +144,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b323..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, for (; start < end; start += PAGE_SIZE) { index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - page = find_get_page(mapping, index); + page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) page_cache_release(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 33365e9ce6a7..a98c7fce470a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1540,7 +1540,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1621,7 +1621,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606f..f6f23833de44 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1332,7 +1332,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 796c7e6cf93b..e8fff0fa1202 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, + nr_failed = migrate_pages(&pagelist, new_page, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 13f47fbe3550..3acac4a62c4b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -941,8 +941,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -986,11 +987,18 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + ClearPageSwapBacked(newpage); + put_new_page(newpage, private); + } else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -1019,8 +1027,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1059,20 +1068,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1089,6 +1108,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1102,7 +1123,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1124,10 +1146,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1276,7 +1299,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1732,7 +1755,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/mincore.c b/mm/mincore.c index 101623378fbf..725c80961048 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c69f474a1b60..139a57ee6b20 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -980,6 +980,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1106,7 +1107,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1174,6 +1177,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1224,7 +1233,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1245,14 +1254,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1639,7 +1642,7 @@ again: goto failed; } __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); spin_unlock(&zone->lock); } @@ -2302,7 +2305,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2316,7 +2319,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2349,7 +2352,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2362,9 +2365,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int migratetype, enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2559,7 +2561,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2653,17 +2655,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2738,12 +2738,10 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -6318,7 +6316,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6357,7 +6355,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/readahead.c b/mm/readahead.c index 1fa0d6fca556..0ca36a7770b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include <linux/kernel.h> -#include <linux/fs.h> #include <linux/gfp.h> -#include <linux/mm.h> #include <linux/export.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -20,6 +18,8 @@ #include <linux/syscalls.h> #include <linux/file.h> +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr) } /* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; -} - -/* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra @@ -347,7 +332,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -427,7 +412,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) diff --git a/mm/shmem.c b/mm/shmem.c index f0d698ba7d0f..0f1447563f17 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; while (index < end) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -1088,7 +1023,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1483,6 +1418,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1795,7 +1735,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1822,7 +1762,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); diff --git a/mm/swap.c b/mm/swap.c index 01fc47983d7c..ba288569e796 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -954,6 +954,57 @@ void __pagevec_lru_add(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search diff --git a/mm/truncate.c b/mm/truncate.c index ac18edc30649..827ad8d2b5cd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -23,6 +23,22 @@ #include <linux/rmap.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + radix_tree_delete_item(&mapping->page_tree, index, entry); + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -209,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; @@ -239,17 +256,23 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -260,6 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -308,14 +332,16 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -324,16 +350,22 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -376,6 +408,7 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; @@ -391,17 +424,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, */ pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -415,6 +454,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -482,6 +522,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -492,17 +533,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -540,6 +587,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c0b36e5b4f8..deb139e6b8ed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2018,13 +2018,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2045,17 +2059,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2063,6 +2068,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; |