aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorHugh Dickins <hugh@veritas.com>2005-10-29 18:16:40 -0700
committerLinus Torvalds <torvalds@g5.osdl.org>2005-10-29 21:40:42 -0700
commit4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 (patch)
tree1f76d33bb1d76221c6424bc5fed080a4f91349a6 /mm
parentb38c6845b695141259019e2b7c0fe6c32a6e720d (diff)
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig13
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/memory.c24
-rw-r--r--mm/mremap.c11
-rw-r--r--mm/page_alloc.c16
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c22
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swap_state.c8
-rw-r--r--mm/swapfile.c12
-rw-r--r--mm/vmscan.c2
12 files changed, 74 insertions, 48 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d13..f35a550ba4b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,16 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+ default "4"
diff --git a/mm/filemap.c b/mm/filemap.c
index 8aa344e8848..f560b41c8f6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -152,7 +152,7 @@ static int sync_page(void *word)
* in the ->sync_page() methods make essential use of the
* page_mapping(), merely passing the page down to the backing
* device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page->private is
+ * ignore it for all cases but swap, where only page_private(page) is
* of interest. When page_mapping() does go NULL, the entire
* call stack gracefully ignores the page and returns.
* -- wli
diff --git a/mm/memory.c b/mm/memory.c
index 8461e2dd91d..e9ef599498b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
if (!new)
return -ENOMEM;
+ pte_lock_init(new);
spin_lock(&mm->page_table_lock);
- if (pmd_present(*pmd)) /* Another has populated it */
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
pte_free(new);
- else {
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
- src_ptl = &src_mm->page_table_lock;
+ src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock(src_ptl);
do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
* (but do_wp_page is only called after already making such a check;
* and do_anonymous_page and do_no_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#endif
pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t pte;
int ret = VM_FAULT_MINOR;
- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;
entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_get(page);
entry = mk_pte(page, vma->vm_page_prot);
- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (!pte_none(*page_table))
goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff_t pgoff;
int err;
- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
return VM_FAULT_MINOR;
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
pte, pmd, write_access, entry);
}
- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
diff --git a/mm/mremap.c b/mm/mremap.c
index 8de77b632a2..b535438c363 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
- spinlock_t *old_ptl;
+ spinlock_t *old_ptl, *new_ptl;
if (vma->vm_file) {
/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_vma->vm_truncate_count = 0;
}
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
set_pte_at(mm, new_addr, new_pte, pte);
}
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
pte_unmap_nested(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0541288ebf4..a2995a5d012 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
struct page *p = page + i;
SetPageCompound(p);
- p->private = (unsigned long)page;
+ set_page_private(p, (unsigned long)page);
}
}
@@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (!PageCompound(p))
bad_page(__FUNCTION__, page);
- if (p->private != (unsigned long)page)
+ if (page_private(p) != (unsigned long)page)
bad_page(__FUNCTION__, page);
ClearPageCompound(p);
}
@@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
* So, we don't need atomic page->flags operations here.
*/
static inline unsigned long page_order(struct page *page) {
- return page->private;
+ return page_private(page);
}
static inline void set_page_order(struct page *page, int order) {
- page->private = order;
+ set_page_private(page, order);
__SetPagePrivate(page);
}
static inline void rmv_page_order(struct page *page)
{
__ClearPagePrivate(page);
- page->private = 0;
+ set_page_private(page, 0);
}
/*
@@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (a) the buddy is free &&
* (b) the buddy is on the buddy system &&
* (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
*
*/
static inline int page_is_buddy(struct page *page, int order)
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order)
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
- page->private = 0;
+ set_page_private(page, 0);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db0..bb2b0d53889 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+ end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
- bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+ bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+ end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
diff --git a/mm/rmap.c b/mm/rmap.c
index a84bdfe582c..a33e779d1bd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
return NULL;
}
- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
update_hiwater_rss(mm);
if (PageAnon(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
diff --git a/mm/shmem.c b/mm/shmem.c
index 37777f4c11f..dc25565a61e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped private
-
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
SGP_QUICK, /* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
entry->val = value;
info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
- kmap_atomic_to_page(entry)->nr_swapped += incdec;
+ if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+ struct page *page = kmap_atomic_to_page(entry);
+ set_page_private(page, page_private(page) + incdec);
+ }
}
/*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
spin_unlock(&info->lock);
page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
- if (page) {
- page->nr_swapped = 0;
- }
+ if (page)
+ set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
diroff = 0;
}
subdir = dir[diroff];
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
nr_swaps_freed += freed;
if (offset)
spin_lock(&info->lock);
- subdir->nr_swapped -= freed;
+ set_page_private(subdir, page_private(subdir) - freed);
if (offset)
spin_unlock(&info->lock);
- BUG_ON(subdir->nr_swapped > offset);
+ BUG_ON(page_private(subdir) > offset);
}
if (offset)
offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
dir = shmem_dir_map(subdir);
}
subdir = *dir;
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
ptr = shmem_swp_map(subdir);
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
diff --git a/mm/swap.c b/mm/swap.c
index 21d15f99805..b89512877ec 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
void put_page(struct page *page)
{
if (unlikely(PageCompound(page))) {
- page = (struct page *)page->private;
+ page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
void (*dtor)(struct page *page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a..cafc1edcbeb 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
page_cache_get(page);
SetPageLocked(page);
SetPageSwapCache(page);
- page->private = entry.val;
+ set_page_private(page, entry.val);
total_swapcache_pages++;
pagecache_acct(1);
}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));
- radix_tree_delete(&swapper_space.page_tree, page->private);
- page->private = 0;
+ radix_tree_delete(&swapper_space.page_tree, page_private(page));
+ set_page_private(page, 0);
ClearPageSwapCache(page);
total_swapcache_pages--;
pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
write_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 510f0039b00..8970c0b7419 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
swp_entry_t entry;
down_read(&swap_unplug_sem);
- entry.val = page->private;
+ entry.val = page_private(page);
if (PageSwapCache(page)) {
struct block_device *bdev = swap_info[swp_type(entry)].bdev;
struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
/*
* If the page is removed from swapcache from under us (with a
* racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page->private above. If
- * the WARN_ON triggers during a swapoff it maybe the race
+ * count to avoid reading garbage from page_private(page) above.
+ * If the WARN_ON triggers during a swapoff it maybe the race
* condition and it's harmless. However if it triggers without
* swapoff it signals a problem.
*/
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
struct swap_info_struct *p;
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
if (page_count(page) != 2) /* 2: us + cache */
return 0;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (!p)
return 0;
@@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page)
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 41d1064aabf..135bf8ca96e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->private };
+ swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);