From e286781d5f2e9c846e012a39653a166e9d31777d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:30 -0700 Subject: mm: speculative page references If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). Thanks to Hugh for pointing out an improvement to the algorithm setting page_count to zero when we have control of all references, in order to hold off speculative getters. [kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()] [hugh@veritas.com: fix add_to_page_cache] [akpm@linux-foundation.org: repair a comment] Signed-off-by: Nick Piggin Cc: Jeff Garzik Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 32 ++++++++++++++----------- mm/migrate.c | 20 ++++++++++++++-- mm/shmem.c | 6 ++--- mm/swap_state.c | 17 +++++++++---- mm/vmscan.c | 74 +++++++++++++++++++++++++++++++++++++++++---------------- 5 files changed, 105 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2d3ec1ffc66..4e182a9a14c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else + } else { + page->mapping = NULL; mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); @@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) diff --git a/mm/migrate.c b/mm/migrate.c index d8c65a65c61..3ca6392e82c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, page = migration_entry_to_page(entry); - get_page(page); + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). + * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; pte_unmap_unlock(ptep, ptl); wait_on_page_locked(page); put_page(page); @@ -305,6 +313,7 @@ out: static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + int expected_count; void **pslot; if (!mapping) { @@ -319,12 +328,18 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - if (page_count(page) != 2 + !!PagePrivate(page) || + expected_count = 2 + !!PagePrivate(page); + if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; } + if (!page_freeze_refs(page, expected_count)) { + write_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + /* * Now we know that no one else is looking at the page. */ @@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); + page_unfreeze_refs(page, expected_count); /* * Drop cache reference from old page. * We know this isn't the last reference. diff --git a/mm/shmem.c b/mm/shmem.c index f92fea94d03..1089092aeca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -936,7 +936,7 @@ found: spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); if (ptr && ptr->val == entry.val) { - error = add_to_page_cache(page, inode->i_mapping, + error = add_to_page_cache_locked(page, inode->i_mapping, idx, GFP_NOWAIT); /* does mem_cgroup_uncharge_cache_page on error */ } else /* we must compensate for our precharge above */ @@ -1301,8 +1301,8 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = add_to_page_cache( - swappage, mapping, idx, GFP_NOWAIT))) { + } else if (!(error = add_to_page_cache_locked(swappage, mapping, + idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0b..3e3381d6c7e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -64,7 +64,7 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) @@ -76,19 +76,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + write_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); + if (likely(!error)) { total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); + + if (unlikely(error)) { + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } } return error; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 26672c6cd3c..0075eac1cd0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -391,12 +391,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } /* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. */ -int remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -427,24 +425,24 @@ int remove_mapping(struct address_space *mapping, struct page *page) * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (unlikely(page_count(page) != 2)) + if (!page_freeze_refs(page, 2)) goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); goto cannot_free; + } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + } else { + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); } - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); return 1; cannot_free: @@ -452,6 +450,26 @@ cannot_free: return 0; } +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PagePrivate(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) - goto free_it; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } } - if (!mapping || !remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; -free_it: unlock_page(page); +free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) - __pagevec_release_nonlru(&freed_pvec); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } continue; activate_locked: @@ -623,7 +657,7 @@ keep: } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) - __pagevec_release_nonlru(&freed_pvec); + __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } -- cgit v1.2.3