From 1f5ce9e93aa96a867f195ed45f6f77935175f12e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Jun 2006 09:34:16 -0400 Subject: VFS: Unexport do_kern_mount() and clean up simple_pin_fs() Replace all module uses with the new vfs_kern_mount() interface, and fix up simple_pin_fs(). Signed-off-by: Trond Myklebust --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 4c5e68e4e9a..8184342440f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2261,7 +2261,7 @@ static int __init init_tmpfs(void) #ifdef CONFIG_TMPFS devfs_mk_dir("shm"); #endif - shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, + shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, tmpfs_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); -- cgit v1.2.3 From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:57 -0700 Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt() which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience function is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees which results in dentries being left unculled. However, with the way NFS superblock sharing are currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation. [akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 1e43c8a865b..7617bb1c6bf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2233,10 +2233,10 @@ static struct vm_operations_struct shmem_vm_ops = { }; -static struct super_block *shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super); + return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } static struct file_system_type tmpfs_fs_type = { -- cgit v1.2.3 From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:58 -0700 Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root dentry Give the statfs superblock operation a dentry pointer rather than a superblock pointer. This complements the get_sb() patch. That reduced the significance of sb->s_root, allowing NFS to place a fake root there. However, NFS does require a dentry to use as a target for the statfs operation. This permits the root in the vfsmount to be used instead. linux/mount.h has been added where necessary to make allyesconfig build successfully. Interest has also been expressed for use with the FUSE and XFS filesystems. Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 7617bb1c6bf..10020d8b407 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1654,9 +1654,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, return desc.error; } -static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; -- cgit v1.2.3 From cb2b95e1c6b56e3d2369d3a5f4bc97f4fa180683 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 Jun 2006 02:03:01 -0700 Subject: [PATCH] zone handle unaligned zone boundaries The buddy allocator has a requirement that boundaries between contigious zones occur aligned with the the MAX_ORDER ranges. Where they do not we will incorrectly merge pages cross zone boundaries. This can lead to pages from the wrong zone being handed out. Originally the buddy allocator would check that buddies were in the same zone by referencing the zone start and end page frame numbers. This was removed as it became very expensive and the buddy allocator already made the assumption that zones boundaries were aligned. It is clear that not all configurations and architectures are honouring this alignment requirement. Therefore it seems safest to reintroduce support for non-aligned zone boundaries. This patch introduces a new check when considering a page a buddy it compares the zone_table index for the two pages and refuses to merge the pages where they do not match. The zone_table index is unique for each node/zone combination when FLATMEM/DISCONTIGMEM is enabled and for each section/zone combination when SPARSEMEM is enabled (a SPARSEMEM section is at least a MAX_ORDER size). Signed-off-by: Andy Whitcroft Cc: Dave Hansen Cc: Mel Gorman Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253a450c400..fd631c2536a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order. + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we use PG_buddy. * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) { #ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid(page_to_pfn(buddy))) return 0; #endif - if (PageBuddy(page) && page_order(page) == order) { - BUG_ON(page_count(page) != 0); + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); -- cgit v1.2.3 From 4da5eda0dca9730f59f391230304526ab4bffec7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:04 -0700 Subject: [PATCH] Page Migration: Make do_swap_page redo the fault It is better to redo the complete fault if do_swap_page() finds that the page is not in PageSwapCache() because the page migration code may have replaced the swap pte already with a pte pointing to valid memory. do_swap_page() may interpret an invalid swap entry without this patch because we do not reload the pte if we are looping back. The page migration code may already have reused the swap entry referenced by our local swp_entry. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0ec7bc64427..7e3683fd4f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1879,7 +1879,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); -again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1903,12 +1902,6 @@ again: mark_page_accessed(page); lock_page(page); - if (!PageSwapCache(page)) { - /* Page migration has occured */ - unlock_page(page); - page_cache_release(page); - goto again; - } /* * Back out if somebody else already faulted in this pte. -- cgit v1.2.3 From 729bd0b74ce9ac6c829109052fcd565f5c366ca5 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:03:05 -0700 Subject: [PATCH] slab: extract cache_free_alien from __cache_free Move alien object freeing to cache_free_alien() to reduce #ifdef clutter in __cache_free(). Signed-off-by: Pekka Enberg Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 77 ++++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f1b644eb39d..bf05ea900ce 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1024,6 +1024,40 @@ static void drain_alien_cache(struct kmem_cache *cachep, } } } + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + struct slab *slabp = virt_to_slab(objp); + int nodeid = slabp->nodeid; + struct kmem_list3 *l3; + struct array_cache *alien = NULL; + + /* + * Make sure we are not freeing a object from another node to the array + * cache on this cpu. + */ + if (likely(slabp->nodeid == numa_node_id())) + return 0; + + l3 = cachep->nodelists[numa_node_id()]; + STATS_INC_NODEFREES(cachep); + if (l3->alien && l3->alien[nodeid]) { + alien = l3->alien[nodeid]; + spin_lock(&alien->lock); + if (unlikely(alien->avail == alien->limit)) { + STATS_INC_ACOVERFLOW(cachep); + __drain_alien_cache(cachep, alien, nodeid); + } + alien->entry[alien->avail++] = objp; + spin_unlock(&alien->lock); + } else { + spin_lock(&(cachep->nodelists[nodeid])->list_lock); + free_block(cachep, &objp, 1, nodeid); + spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + } + return 1; +} + #else #define drain_alien_cache(cachep, alien) do { } while (0) @@ -1038,6 +1072,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) { } +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + #endif static int cpuup_callback(struct notifier_block *nfb, @@ -3087,41 +3126,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - /* Make sure we are not freeing a object from another - * node to the array cache on this cpu. - */ -#ifdef CONFIG_NUMA - { - struct slab *slabp; - slabp = virt_to_slab(objp); - if (unlikely(slabp->nodeid != numa_node_id())) { - struct array_cache *alien = NULL; - int nodeid = slabp->nodeid; - struct kmem_list3 *l3; - - l3 = cachep->nodelists[numa_node_id()]; - STATS_INC_NODEFREES(cachep); - if (l3->alien && l3->alien[nodeid]) { - alien = l3->alien[nodeid]; - spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { - STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, - alien, nodeid); - } - alien->entry[alien->avail++] = objp; - spin_unlock(&alien->lock); - } else { - spin_lock(&(cachep->nodelists[nodeid])-> - list_lock); - free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->nodelists[nodeid])-> - list_lock); - } - return; - } - } -#endif + if (cache_free_alien(cachep, objp)) + return; + if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); ac->entry[ac->avail++] = objp; -- cgit v1.2.3 From 4776874ff096cd410382c0eca5d75f69c9dfa58f Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:03:07 -0700 Subject: [PATCH] slab: page mapping cleanup Clean up slab allocator page mapping a bit. The memory allocated for a slab is physically contiguous so it is okay to assume struct pages are too so kill the long-standing comment. Furthermore, rename set_slab_attr to slab_map_pages and add a comment explaining why its needed. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bf05ea900ce..a94cf0fea8a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2499,23 +2499,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, slabp->inuse--; } -static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, - void *objp) +/* + * Map pages beginning at addr to the given cache and slab. This is required + * for the slab allocator to be able to lookup the cache and slab of a + * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + */ +static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, + void *addr) { - int i; + int nr_pages; struct page *page; - /* Nasty!!!!!! I hope this is OK. */ - page = virt_to_page(objp); + page = virt_to_page(addr); - i = 1; + nr_pages = 1; if (likely(!PageCompound(page))) - i <<= cachep->gfporder; + nr_pages <<= cache->gfporder; + do { - page_set_cache(page, cachep); - page_set_slab(page, slabp); + page_set_cache(page, cache); + page_set_slab(page, slab); page++; - } while (--i); + } while (--nr_pages); } /* @@ -2587,7 +2592,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) goto opps1; slabp->nodeid = nodeid; - set_slab_attr(cachep, slabp, objp); + slab_map_pages(cachep, slabp, objp); cache_init_objs(cachep, slabp, ctor_flags); -- cgit v1.2.3 From 3c5a87f476bed45616e7e543dcaea4440c77bf93 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:08 -0700 Subject: [PATCH] migration: remove unnecessary PageSwapCache checks Remove two unnecessary PageSwapCache checks. The page refcount is raised and therefore page migration cannot occur in both functions. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 -------- mm/swapfile.c | 7 ------- 2 files changed, 15 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 10020d8b407..84b5cf9b63c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1081,14 +1081,6 @@ repeat: page_cache_release(swappage); goto repeat; } - if (!PageSwapCache(swappage)) { - /* Page migration has occured */ - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); - goto repeat; - } if (PageWriteback(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index e5fd5385f0c..47a6812f5f8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -716,7 +716,6 @@ static int try_to_unuse(unsigned int type) */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); -again: page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* @@ -751,12 +750,6 @@ again: wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); - if (!PageSwapCache(page)) { - /* Page migration has occured */ - unlock_page(page); - page_cache_release(page); - goto again; - } wait_on_page_writeback(page); /* -- cgit v1.2.3 From 02b694dea473ad3db1e2d1b14c1fef8fbd92e5e6 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:08 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: change name of wait_table_size() This is just to rename from wait_table_size() to wait_table_hash_nr_entries(). Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd631c2536a..27320a0542d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1727,7 +1727,7 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 -static inline unsigned long wait_table_size(unsigned long pages) +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -2019,13 +2019,15 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) * The per-page waitqueue mechanism uses hashed waitqueues * per zone. */ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size + alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t)); - for(i = 0; i < zone->wait_table_size; ++i) + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); } -- cgit v1.2.3 From 86356ab147669bd3bcb2149fd9561d1280835c24 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:09 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: change to meminit for build_zonelist Change definitions of some functions and data from __init to __meminit. These functions and data can be used after bootup by this patch to be used for hot-add codes. Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27320a0542d..5ae75bead4d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -1517,7 +1517,7 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, +static int __meminit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; @@ -1553,7 +1553,7 @@ static inline int highest_zone(int zone_bits) #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; +static int __meminitdata node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1568,7 +1568,7 @@ static int __initdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1614,7 +1614,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1666,7 +1666,7 @@ static void __init build_zonelists(pg_data_t *pgdat) #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -2071,7 +2071,7 @@ static __meminit void init_currently_empty_zone(struct zone *zone, * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, +static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long j; @@ -2159,7 +2159,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { -- cgit v1.2.3 From 718127cc3170454f4aa274fdd2f1e01574fecd66 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:10 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: add return code for init_current_empty_zone When add_zone() is called against empty zone (not populated zone), we have to initialize the zone which didn't initialize at boot time. But, init_currently_empty_zone() may fail due to allocation of wait table. So, this patch is to catch its error code. Changes against wait_table is in the next patch. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 +++++++++++++-- mm/page_alloc.c | 11 ++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70df5c0d957..71da5c98c9c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,7 +26,7 @@ extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, unsigned long size); -static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; @@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) int zone_type; zone_type = zone - pgdat->node_zones; + if (!populated_zone(zone)) { + int ret = 0; + ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); + if (ret < 0) + return ret; + } memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); + return 0; } extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, @@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) if (ret < 0) return ret; - __add_zone(zone, phys_start_pfn); + ret = __add_zone(zone, phys_start_pfn); + + if (ret < 0) + return ret; + return register_new_memory(__pfn_to_section(phys_start_pfn)); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ae75bead4d..4bc66f6b771 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2050,8 +2050,9 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __meminit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -2063,6 +2064,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + return 0; } /* @@ -2077,6 +2080,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; @@ -2118,7 +2122,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, continue; zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); + BUG_ON(ret); zone_start_pfn += size; } } -- cgit v1.2.3 From cca448fe92246fb59efe55ba2e048ded0971a9af Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:10 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: wait_table initialization Wait_table is initialized according to zone size at boot time. But, we cannot know the maixmum zone size when memory hotplug is enabled. It can be changed.... And resizing of wait_table is hard. So kernel allocate and initialzie wait_table as its maximum size. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4bc66f6b771..62564e27b44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1727,6 +1727,7 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 +#ifndef CONFIG_MEMORY_HOTPLUG static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -1745,6 +1746,29 @@ static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) return max(size, 4UL); } +#else +/* + * A zone's size might be changed by hot-add, so it is not possible to determine + * a suitable size for its wait_table. So we use the maximum size now. + * + * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: + * + * i386 (preemption config) : 4096 x 16 = 64Kbyte. + * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. + * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. + * + * The maximum entries are prepared when a zone's memory is (512K + 256) pages + * or more by the traditional way. (See above). It equals: + * + * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. + * ia64(16K page size) : = ( 8G + 4M)byte. + * powerpc (64K page size) : = (32G +16M)byte. + */ +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + return 4096UL; +} +#endif /* * This is an integer logarithm so that shifts can be used later @@ -2010,10 +2034,11 @@ void __init setup_per_cpu_pageset(void) #endif static __meminit -void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; struct pglist_data *pgdat = zone->zone_pgdat; + size_t alloc_size; /* * The per-page waitqueue mechanism uses hashed waitqueues @@ -2023,12 +2048,32 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) wait_table_hash_nr_entries(zone_size_pages); zone->wait_table_bits = wait_table_bits(zone->wait_table_hash_nr_entries); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t)); + alloc_size = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + + if (system_state == SYSTEM_BOOTING) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, alloc_size); + } else { + /* + * This case means that a zone whose size was 0 gets new memory + * via memory hot-add. + * But it may be the case that a new node was hot-added. In + * this case vmalloc() will not be able to use this new node's + * memory - this wait_table must be initialized to use this new + * node itself as well. + * To use this new node's memory, further consideration will be + * necessary. + */ + zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); + } + if (!zone->wait_table) + return -ENOMEM; for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); + + return 0; } static __meminit void zone_pcp_init(struct zone *zone) @@ -2055,8 +2100,10 @@ __meminit int init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - - zone_wait_table_init(zone, size); + int ret; + ret = zone_wait_table_init(zone, size); + if (ret) + return ret; pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; -- cgit v1.2.3 From 6811378e7d8b9aa4fca2a1ca73d24c9d67c9cb12 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:11 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: update zonelists In current code, zonelist is considered to be build once, no modification. But MemoryHotplug can add new zone/pgdat. It must be updated. This patch modifies build_all_zonelists(). By this, build_all_zonelist() can reconfig pgdat's zonelists. To update them safety, this patch use stop_machine_run(). Other cpus don't touch among updating them by using it. In old version (V2 of node hotadd), kernel updated them after zone initialization. But present_page of its new zone is still 0, because online_page() is not called yet at this time. Build_zonelists() checks present_pages to find present zone. It was too early. So, I changed it after online_pages(). Signed-off-by: Yasunori Goto Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 12 ++++++++++++ mm/page_alloc.c | 26 +++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 71da5c98c9c..1b1ac3db218 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -127,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; + int need_zonelists_rebuild = 0; /* * This doesn't need a lock to do pfn_to_page(). @@ -139,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); pgdat_resize_unlock(zone->zone_pgdat, &flags); + /* + * If this zone is not populated, then it is not in zonelist. + * This means the page allocator ignores this zone. + * So, zonelist must be updated after online. + */ + if (!populated_zone(zone)) + need_zonelists_rebuild = 1; + for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn + i); online_page(page); @@ -149,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) setup_per_zone_pages_min(); + if (need_zonelists_rebuild) + build_all_zonelists(); + return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62564e27b44..9dfbe6b7d1c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1704,14 +1705,29 @@ static void __meminit build_zonelists(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -void __init build_all_zonelists(void) +/* return values int ....just for stop_machine_run() */ +static int __meminit __build_all_zonelists(void *dummy) { - int i; + int nid; + for_each_online_node(nid) + build_zonelists(NODE_DATA(nid)); + return 0; +} + +void __meminit build_all_zonelists(void) +{ + if (system_state == SYSTEM_BOOTING) { + __build_all_zonelists(0); + cpuset_init_current_mems_allowed(); + } else { + /* we have to stop all cpus to guaranntee there is no user + of zonelist */ + stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + /* cpuset refresh routine should be here */ + } - for_each_online_node(i) - build_zonelists(NODE_DATA(i)); printk("Built %i zonelists\n", num_online_nodes()); - cpuset_init_current_mems_allowed(); + } /* -- cgit v1.2.3 From 67de648211fa041fe08a0c25241a4980bbb90698 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 Jun 2006 02:03:12 -0700 Subject: [PATCH] squash duplicate page_to_pfn and pfn_to_page We have architectures where the size of page_to_pfn and pfn_to_page are significant enough to overall image size that they wish to push them out of line. However, in the process we have grown a second copy of the implementation of each of these routines for each memory model. Share the implmentation exposing it either inline or out-of-line as required. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dfbe6b7d1c..5af33186a25 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2879,42 +2879,14 @@ void *__init alloc_large_system_hash(const char *tablename, } #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -/* - * pfn <-> page translation. out-of-line version. - * (see asm-generic/memory_model.h) - */ -#if defined(CONFIG_FLATMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - return mem_map + (pfn - ARCH_PFN_OFFSET); -} -unsigned long page_to_pfn(struct page *page) -{ - return (page - mem_map) + ARCH_PFN_OFFSET; -} -#elif defined(CONFIG_DISCONTIGMEM) struct page *pfn_to_page(unsigned long pfn) { - int nid = arch_pfn_to_nid(pfn); - return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); + return __pfn_to_page(pfn); } unsigned long page_to_pfn(struct page *page) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); - return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; -} -#elif defined(CONFIG_SPARSEMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; -} - -unsigned long page_to_pfn(struct page *page) -{ - long section_id = page_to_section(page); - return page - __section_mem_map_addr(__nr_to_section(section_id)); + return __page_to_pfn(page); } -#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ -- cgit v1.2.3 From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Jun 2006 02:03:13 -0700 Subject: [PATCH] support for panic at OOM This patch adds panic_on_oom sysctl under sys.vm. When sysctl vm.panic_on_oom = 1, the kernel panics intead of killing rogue processes. And if vm.panic_on_oom is 0 the kernel will do oom_kill() in the same way as it does today. Of course, the default value is 0 and only root can modifies it. In general, oom_killer works well and kill rogue processes. So the whole system can survive. But there are environments where panic is preferable rather than kill some processes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3e..f9bb3cf3238 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -22,6 +22,7 @@ #include #include +int sysctl_panic_on_oom; /* #define DEBUG */ /** @@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) break; case CONSTRAINT_NONE: + if (sysctl_panic_on_oom) + panic("out of memory. panic_on_oom is selected\n"); retry: /* * Rambo mode: Shoot down a process and hope it solves whatever -- cgit v1.2.3 From 6937a25cff818d32d0f9ff58a518c9ab96760aeb Mon Sep 17 00:00:00 2001 From: Dave Peterson Date: Fri, 23 Jun 2006 02:03:13 -0700 Subject: [PATCH] mm: fix typos in comments in mm/oom_kill.c This fixes a few typos in the comments in mm/oom_kill.c. Signed-off-by: David S. Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9bb3cf3238..d46ed0f1dc0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,7 +26,7 @@ int sysctl_panic_on_oom; /* #define DEBUG */ /** - * oom_badness - calculate a numeric value for how bad this task has been + * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate * @uptime: current uptime in seconds * @@ -201,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) continue; /* - * This is in the process of releasing memory so for wait it + * This is in the process of releasing memory so wait for it * to finish before killing some other task by mistake. */ releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || @@ -307,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, } /** - * oom_kill - kill the "best" process when we run out of memory + * out_of_memory - kill the "best" process when we run out of memory * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) -- cgit v1.2.3 From a43a8c39bbb493c9e93f6764b350de2e33e18e92 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 23 Jun 2006 02:03:15 -0700 Subject: [PATCH] tightening hugetlb strict accounting Current hugetlb strict accounting for shared mapping always assume mapping starts at zero file offset and reserves pages between zero and size of the file. This assumption often reserves (or lock down) a lot more pages then necessary if application maps at none zero file offset. libhugetlbfs is one example that requires proper reservation on shared mapping starts at none zero offset. This patch extends the reservation and hugetlb strict accounting to support any arbitrary pair of (offset, len), resulting a much more robust and accurate scheme. More importantly, it won't lock down any hugetlb pages outside file mapping. Signed-off-by: Ken Chen Acked-by: Adam Litke Cc: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 282 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 162 insertions(+), 120 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 832f676ca03..df499973255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; - int use_reserve = 0; - unsigned long idx; spin_lock(&hugetlb_lock); - - if (vma->vm_flags & VM_MAYSHARE) { - - /* idx = radix tree index, i.e. offset into file in - * HPAGE_SIZE units */ - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - - /* The hugetlbfs specific inode info stores the number - * of "guaranteed available" (huge) pages. That is, - * the first 'prereserved_hpages' pages of the inode - * are either already instantiated, or have been - * pre-reserved (by hugetlb_reserve_for_inode()). Here - * we're in the process of instantiating the page, so - * we use this to determine whether to draw from the - * pre-reserved pool or the truly free pool. */ - if (idx < HUGETLBFS_I(inode)->prereserved_hpages) - use_reserve = 1; - } - - if (!use_reserve) { - if (free_huge_pages <= reserved_huge_pages) - goto fail; - } else { - BUG_ON(reserved_huge_pages == 0); - reserved_huge_pages--; - } + if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages--; + else if (free_huge_pages <= resv_huge_pages) + goto fail; page = dequeue_huge_page(vma, addr); if (!page) @@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); return page; - fail: - WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ +fail: spin_unlock(&hugetlb_lock); return NULL; } -/* hugetlb_extend_reservation() - * - * Ensure that at least 'atleast' hugepages are, and will remain, - * available to instantiate the first 'atleast' pages of the given - * inode. If the inode doesn't already have this many pages reserved - * or instantiated, set aside some hugepages in the reserved pool to - * satisfy later faults (or fail now if there aren't enough, rather - * than getting the SIGBUS later). - */ -int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, - unsigned long atleast) -{ - struct inode *inode = &info->vfs_inode; - unsigned long change_in_reserve = 0; - int ret = 0; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages >= atleast) - goto out; - - /* Because we always call this on shared mappings, none of the - * pages beyond info->prereserved_hpages can have been - * instantiated, so we need to reserve all of them now. */ - change_in_reserve = atleast - info->prereserved_hpages; - - if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { - ret = -ENOMEM; - goto out; - } - - reserved_huge_pages += change_in_reserve; - info->prereserved_hpages = atleast; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); - - return ret; -} - -/* hugetlb_truncate_reservation() - * - * This returns pages reserved for the given inode to the general free - * hugepage pool. If the inode has any pages prereserved, but not - * instantiated, beyond offset (atmost << HPAGE_SIZE), then release - * them. - */ -void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, - unsigned long atmost) -{ - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long idx; - unsigned long change_in_reserve = 0; - struct page *page; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages <= atmost) - goto out; - - /* Count pages which were reserved, but not instantiated, and - * which we can now release. */ - for (idx = atmost; idx < info->prereserved_hpages; idx++) { - page = radix_tree_lookup(&mapping->page_tree, idx); - if (!page) - /* Pages which are already instantiated can't - * be unreserved (and in fact have already - * been removed from the reserved pool) */ - change_in_reserve++; - } - - BUG_ON(reserved_huge_pages < change_in_reserve); - reserved_huge_pages -= change_in_reserve; - info->prereserved_hpages = atmost; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); -} - static int __init hugetlb_init(void) { unsigned long i; @@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) return nr_huge_pages; spin_lock(&hugetlb_lock); - count = max(count, reserved_huge_pages); + count = max(count, resv_huge_pages); try_to_free_low(count); while (count < nr_huge_pages) { struct page *page = dequeue_huge_page(NULL, 0); @@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, - reserved_huge_pages, + resv_huge_pages, HPAGE_SIZE/1024); } @@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarentee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (nrg == 0) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static int hugetlb_acct_memory(long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + if ((delta + resv_huge_pages) <= free_huge_pages) { + resv_huge_pages += delta; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} + +int hugetlb_reserve_pages(struct inode *inode, long from, long to) +{ + long ret, chg; + + chg = region_chg(&inode->i_mapping->private_list, from, to); + if (chg < 0) + return chg; + ret = hugetlb_acct_memory(chg); + if (ret < 0) + return ret; + region_add(&inode->i_mapping->private_list, from, to); + return 0; +} + +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +{ + long chg = region_truncate(&inode->i_mapping->private_list, offset); + hugetlb_acct_memory(freed - chg); +} -- cgit v1.2.3 From e1b6aa6f1404f162697650df2cdb6c374b1d6a5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jun 2006 02:03:17 -0700 Subject: [PATCH] slab: clean up kmem_getpages The last ifdef addition hit the ugliness treshold on this functions, so: - rename the variable i to nr_pages so it's somewhat descriptive - remove the addr variable and do the page_address call at the very end - instead of ifdef'ing the whole alloc_pages_node call just make the __GFP_COMP addition to flags conditional - rewrite the __GFP_COMP comment to make sense Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a94cf0fea8a..7bd19639efd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1489,31 +1489,29 @@ __initcall(cpucache_init); static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; - void *addr; + int nr_pages; int i; - flags |= cachep->gfpflags; #ifndef CONFIG_MMU - /* nommu uses slab's for process anonymous memory allocations, so - * requires __GFP_COMP to properly refcount higher order allocations" + /* + * Nommu uses slab's for process anonymous memory allocations, and thus + * requires __GFP_COMP to properly refcount higher order allocations */ - page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); -#else - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + flags |= __GFP_COMP; #endif + flags |= cachep->gfpflags; + + page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; - addr = page_address(page); - i = (1 << cachep->gfporder); + nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_add(i, &slab_reclaim_pages); - add_page_state(nr_slab, i); - while (i--) { - __SetPageSlab(page); - page++; - } - return addr; + atomic_add(nr_pages, &slab_reclaim_pages); + add_page_state(nr_slab, nr_pages); + for (i = 0; i < nr_pages; i++) + __SetPageSlab(page + i); + return page_address(page); } /* -- cgit v1.2.3 From 7a7c381d25067b9a2bfe025dfcb16459daec0373 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jun 2006 02:03:17 -0700 Subject: [PATCH] slab: stop using list_for_each Use the _entry variant everywhere to clean the code up a tiny bit. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7bd19639efd..54bd2eb3f2f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1950,8 +1950,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL; - struct list_head *p; + struct kmem_cache *cachep = NULL, *pc; /* * Sanity checks... these are all serious usage bugs. @@ -1971,8 +1970,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, mutex_lock(&cache_chain_mutex); - list_for_each(p, &cache_chain) { - struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); + list_for_each_entry(pc, &cache_chain, next) { mm_segment_t old_fs = get_fs(); char tmp; int res; @@ -3690,7 +3688,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, */ static void cache_reap(void *unused) { - struct list_head *walk; + struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); @@ -3701,13 +3699,11 @@ static void cache_reap(void *unused) return; } - list_for_each(walk, &cache_chain) { - struct kmem_cache *searchp; + list_for_each_entry(searchp, &cache_chain, next) { struct list_head *p; int tofree; struct slab *slabp; - searchp = list_entry(walk, struct kmem_cache, next); check_irq_on(); /* @@ -3835,7 +3831,6 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = p; - struct list_head *q; struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -3856,15 +3851,13 @@ static int s_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&l3->list_lock); - list_for_each(q, &l3->slabs_full) { - slabp = list_entry(q, struct slab, list); + list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each(q, &l3->slabs_partial) { - slabp = list_entry(q, struct slab, list); + list_for_each_entry(slabp, &l3->slabs_partial, list) { if (slabp->inuse == cachep->num && !error) error = "slabs_partial inuse accounting error"; if (!slabp->inuse && !error) @@ -3872,8 +3865,7 @@ static int s_show(struct seq_file *m, void *p) active_objs += slabp->inuse; active_slabs++; } - list_for_each(q, &l3->slabs_free) { - slabp = list_entry(q, struct slab, list); + list_for_each_entry(slabp, &l3->slabs_free, list) { if (slabp->inuse && !error) error = "slabs_free/inuse accounting error"; num_slabs++; @@ -3966,7 +3958,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; int limit, batchcount, shared, res; - struct list_head *p; + struct kmem_cache *cachep; if (count > MAX_SLABINFO_WRITE) return -EINVAL; @@ -3985,10 +3977,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, /* Find the cache in the chain of caches. */ mutex_lock(&cache_chain_mutex); res = -EINVAL; - list_for_each(p, &cache_chain) { - struct kmem_cache *cachep; - - cachep = list_entry(p, struct kmem_cache, next); + list_for_each_entry(cachep, &cache_chain, next) { if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || batchcount < 1 || batchcount > limit || shared < 0) { @@ -4090,7 +4079,6 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = p; - struct list_head *q; struct slab *slabp; struct kmem_list3 *l3; const char *name; @@ -4115,14 +4103,10 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&l3->list_lock); - list_for_each(q, &l3->slabs_full) { - slabp = list_entry(q, struct slab, list); + list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); - } - list_for_each(q, &l3->slabs_partial) { - slabp = list_entry(q, struct slab, list); + list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - } spin_unlock_irq(&l3->list_lock); } name = cachep->name; -- cgit v1.2.3 From d6277db4ab271862ed599da08d78961c70f00002 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:03:18 -0700 Subject: [PATCH] swsusp: rework memory shrinker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the swsusp's memory shrinker in the following way: - Simplify balance_pgdat() by removing all of the swsusp-related code from it. - Make shrink_all_memory() use shrink_slab() and a new function shrink_all_zones() which calls shrink_active_list() and shrink_inactive_list() directly for each zone in a way that's optimized for suspend. In shrink_all_memory() we try to free exactly as many pages as the caller asks for, preferably in one shot, starting from easier targets.  If slab caches are huge, they are most likely to have enough pages to reclaim.  The inactive lists are next (the zones with more inactive pages go first) etc. Each time shrink_all_memory() attempts to shrink the active and inactive lists for each zone in 5 passes.  In the first pass, only the inactive lists are taken into consideration.  In the next two passes the active lists are also shrunk, but mapped pages are not reclaimed.  In the last two passes the active and inactive lists are shrunk and mapped pages are reclaimed as well. The aim of this is to alter the reclaim logic to choose the best pages to keep on resume and improve the responsiveness of the resumed system. Signed-off-by: Rafael J. Wysocki Signed-off-by: Con Kolivas Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 219 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 164 insertions(+), 55 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 440a733fe2e..46be8a02280 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -61,6 +61,8 @@ struct scan_control { * In this context, it doesn't matter that we scan the * whole list at once. */ int swap_cluster_max; + + int swappiness; }; /* @@ -741,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * A 100% value of vm_swappiness overrides this algorithm * altogether. */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; /* * Now use this metric to decide whether to start moving mapped @@ -957,6 +959,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, + .swappiness = vm_swappiness, }; inc_page_state(allocstall); @@ -1021,10 +1024,6 @@ out: * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at pages_high. * - * If `nr_pages' is non-zero then it is the number of pages which are to be - * reclaimed, regardless of the zone occupancies. This is a software suspend - * special. - * * Returns the number of pages which were actually freed. * * There is special handling here for zones which are full of pinned pages. @@ -1042,10 +1041,8 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, - int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { - unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; @@ -1055,7 +1052,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_swap = 1, - .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, }; loop_again: @@ -1082,31 +1080,26 @@ loop_again: all_zones_ok = 1; - if (nr_pages == 0) { - /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) - continue; + if (!populated_zone(zone)) + continue; - if (zone->all_unreclaimable && - priority != DEF_PRIORITY) - continue; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; - if (!zone_watermark_ok(zone, order, - zone->pages_high, 0, 0)) { - end_zone = i; - goto scan; - } + if (!zone_watermark_ok(zone, order, zone->pages_high, + 0, 0)) { + end_zone = i; + goto scan; } - goto out; - } else { - end_zone = pgdat->nr_zones - 1; } + goto out; scan: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -1133,11 +1126,9 @@ scan: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (nr_pages == 0) { /* Not software suspend */ - if (!zone_watermark_ok(zone, order, - zone->pages_high, end_zone, 0)) - all_zones_ok = 0; - } + if (!zone_watermark_ok(zone, order, zone->pages_high, + end_zone, 0)) + all_zones_ok = 0; zone->temp_priority = priority; if (zone->prev_priority > priority) zone->prev_priority = priority; @@ -1162,8 +1153,6 @@ scan: total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > nr_reclaimed) - continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ /* @@ -1179,7 +1168,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) + if (nr_reclaimed >= SWAP_CLUSTER_MAX) break; } out: @@ -1261,7 +1250,7 @@ static int kswapd(void *p) } finish_wait(&pgdat->kswapd_wait, &wait); - balance_pgdat(pgdat, 0, order); + balance_pgdat(pgdat, order); } return 0; } @@ -1290,35 +1279,154 @@ void wakeup_kswapd(struct zone *zone, int order) #ifdef CONFIG_PM /* - * Try to free `nr_pages' of memory, system-wide. Returns the number of freed - * pages. + * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages + * from LRU lists system-wide, for given pass and priority, and returns the + * number of reclaimed pages + * + * For pass > 3 we also try to shrink the LRU lists that contain a few pages + */ +static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, + int prio, struct scan_control *sc) +{ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + + for_each_zone(zone) { + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && prio != DEF_PRIORITY) + continue; + + /* For pass = 0 we don't shrink the active list */ + if (pass > 0) { + zone->nr_scan_active += (zone->nr_active >> prio) + 1; + if (zone->nr_scan_active >= nr_pages || pass > 3) { + zone->nr_scan_active = 0; + nr_to_scan = min(nr_pages, zone->nr_active); + shrink_active_list(nr_to_scan, zone, sc); + } + } + + zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; + if (zone->nr_scan_inactive >= nr_pages || pass > 3) { + zone->nr_scan_inactive = 0; + nr_to_scan = min(nr_pages, zone->nr_inactive); + ret += shrink_inactive_list(nr_to_scan, zone, sc); + if (ret >= nr_pages) + return ret; + } + } + + return ret; +} + +/* + * Try to free `nr_pages' of memory, system-wide, and return the number of + * freed pages. + * + * Rather than trying to age LRUs the aim is to preserve the overall + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_pages) { - pg_data_t *pgdat; - unsigned long nr_to_free = nr_pages; + unsigned long lru_pages, nr_slab; unsigned long ret = 0; - unsigned retry = 2; - struct reclaim_state reclaim_state = { - .reclaimed_slab = 0, + int pass; + struct reclaim_state reclaim_state; + struct zone *zone; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, + .swappiness = vm_swappiness, }; current->reclaim_state = &reclaim_state; -repeat: - for_each_online_pgdat(pgdat) { - unsigned long freed; - freed = balance_pgdat(pgdat, nr_to_free, 0); - ret += freed; - nr_to_free -= freed; - if ((long)nr_to_free <= 0) + lru_pages = 0; + for_each_zone(zone) + lru_pages += zone->nr_active + zone->nr_inactive; + + nr_slab = read_page_state(nr_slab); + /* If slab caches are huge, it's better to hit them first */ + while (nr_slab >= lru_pages) { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + if (!reclaim_state.reclaimed_slab) break; + + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + nr_slab -= reclaim_state.reclaimed_slab; } - if (retry-- && ret < nr_pages) { - blk_congestion_wait(WRITE, HZ/5); - goto repeat; + + /* + * We try to shrink LRUs in 5 passes: + * 0 = Reclaim from inactive_list only + * 1 = Reclaim from active list but don't reclaim mapped + * 2 = 2nd pass of type 1 + * 3 = Reclaim mapped (normal reclaim) + * 4 = 2nd pass of type 3 + */ + for (pass = 0; pass < 5; pass++) { + int prio; + + /* Needed for shrinking slab caches later on */ + if (!lru_pages) + for_each_zone(zone) { + lru_pages += zone->nr_active; + lru_pages += zone->nr_inactive; + } + + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; + sc.swappiness = 100; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { + unsigned long nr_to_scan = nr_pages - ret; + + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + + ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (ret >= nr_pages) + goto out; + + reclaim_state.reclaimed_slab = 0; + shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + if (sc.nr_scanned && prio < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ / 10); + } + + lru_pages = 0; } + + /* + * If ret = 0, we could not shrink LRUs, but there may be something + * in slab caches + */ + if (!ret) + do { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + ret += reclaim_state.reclaimed_slab; + } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + +out: current->reclaim_state = NULL; + return ret; } #endif @@ -1416,6 +1524,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, + .swappiness = vm_swappiness, }; disable_swap_token(); -- cgit v1.2.3 From 833423143c3a7c6545e409d65febd0d92deb351b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 23 Jun 2006 02:03:20 -0700 Subject: [PATCH] mm: introduce remap_vmalloc_range() Add remap_vmalloc_range, vmalloc_user, and vmalloc_32_user so that drivers can have a nice interface for remapping vmalloc memory. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0504f1e34e..35f8553f893 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -256,6 +256,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); } +/* Caller must hold vmlist_lock */ +static struct vm_struct *__find_vm_area(void *addr) +{ + struct vm_struct *tmp; + + for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { + if (tmp->addr == addr) + break; + } + + return tmp; +} + /* Caller must hold vmlist_lock */ struct vm_struct *__remove_vm_area(void *addr) { @@ -498,10 +511,32 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } EXPORT_SYMBOL(vmalloc); +/** + * vmalloc_user - allocate virtually contiguous memory which has + * been zeroed so it can be mapped to userspace without + * leaking data. + * + * @size: allocation size + */ +void *vmalloc_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + /** * vmalloc_node - allocate memory on a specific node * @@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); } EXPORT_SYMBOL(vmalloc_node); @@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size) } EXPORT_SYMBOL(vmalloc_32); +/** + * vmalloc_32_user - allocate virtually contiguous memory (32bit + * addressable) which is zeroed so it can be + * mapped to userspace without leaking data. + * + * @size: allocation size + */ +void *vmalloc_32_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + + return ret; +} +EXPORT_SYMBOL(vmalloc_32_user); + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; @@ -630,3 +687,64 @@ finished: read_unlock(&vmlist_lock); return buf - buf_start; } + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * @returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + struct vm_struct *area; + unsigned long uaddr = vma->vm_start; + unsigned long usize = vma->vm_end - vma->vm_start; + int ret; + + if ((PAGE_SIZE-1) & (unsigned long)addr) + return -EINVAL; + + read_lock(&vmlist_lock); + area = __find_vm_area(addr); + if (!area) + goto out_einval_locked; + + if (!(area->flags & VM_USERMAP)) + goto out_einval_locked; + + if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + goto out_einval_locked; + read_unlock(&vmlist_lock); + + addr += pgoff << PAGE_SHIFT; + do { + struct page *page = vmalloc_to_page(addr); + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PAGE_SIZE; + addr += PAGE_SIZE; + usize -= PAGE_SIZE; + } while (usize > 0); + + /* Prevent "things" like memory migration? VM_flags need a cleanup... */ + vma->vm_flags |= VM_RESERVED; + + return ret; + +out_einval_locked: + read_unlock(&vmlist_lock); + return -EINVAL; +} +EXPORT_SYMBOL(remap_vmalloc_range); + -- cgit v1.2.3 From b344e05c585406904c70865e531e02467c4c7931 Mon Sep 17 00:00:00 2001 From: Hua Zhong Date: Fri, 23 Jun 2006 02:03:23 -0700 Subject: [PATCH] likely cleanup: remove unlikely in sys_mprotect() With likely/unlikely profiling on my not-so-busy-typical-developmentsystem there are 5k misses vs 2k hits. So I guess we should remove the unlikely. Signed-off-by: Hua Zhong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c14d4289b6..5faf01ad3ef 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -205,8 +205,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* * Does the application expect PROT_READ to imply PROT_EXEC: */ - if (unlikely((prot & PROT_READ) && - (current->personality & READ_IMPLIES_EXEC))) + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) prot |= PROT_EXEC; vm_flags = calc_vm_prot_bits(prot); -- cgit v1.2.3 From 58ce1fd5805647a58a050bbbbd2252ea5ecb47b3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:03:24 -0700 Subject: [PATCH] slab: redzone double-free detection At present our slab debugging tells us that it detected a double-free or corruption - it does not distinguish between them. Sometimes it's useful to be able to differentiate between these two types of information. Add double-free detection to redzone verification when freeing an object. As explained by Manfred, when we are freeing an object, both redzones should be RED_ACTIVE. However, if both are RED_INACTIVE, we are trying to free an object that was already free'd. Signed-off-by: Manfred Spraul Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 54bd2eb3f2f..2046da2aa46 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2636,6 +2636,28 @@ static void kfree_debugcheck(const void *objp) } } +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) +{ + unsigned long redzone1, redzone2; + + redzone1 = *dbg_redzone1(cache, obj); + redzone2 = *dbg_redzone2(cache, obj); + + /* + * Redzone is ok. + */ + if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) + return; + + if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) + slab_error(cache, "double free detected"); + else + slab_error(cache, "memory outside object was overwritten"); + + printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", + obj, redzone1, redzone2); +} + static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, void *caller) { @@ -2659,15 +2681,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || - *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%lx, " - "redzone 2:0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); - } + verify_redzone_free(cachep, objp); *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } -- cgit v1.2.3 From 111ebb6e6f7bd7de6d722c5848e95621f43700d9 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 23 Jun 2006 02:03:26 -0700 Subject: [PATCH] writeback: fix range handling When a writeback_control's `start' and `end' fields are used to indicate a one-byte-range starting at file offset zero, the required values of .start=0,.end=0 mean that the ->writepages() implementation has no way of telling that it is being asked to perform a range request. Because we're currently overloading (start == 0 && end == 0) to mean "this is not a write-a-range request". To make all this sane, the patch changes range of writeback_control. So caller does: If it is calling ->writepages() to write pages, it sets range (range_start/end or range_cyclic) always. And if range_cyclic is true, ->writepages() thinks the range is cyclic, otherwise it just uses range_start and range_end. This patch does, - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h -1 is usually ok for range_end (type is long long). But, if someone did, range_end += val; range_end is "val - 1" u64val = range_end >> bits; u64val is "~(0ULL)" or something, they are wrong. So, this adds LLONG_MAX to avoid nasty things, and uses LLONG_MAX for range_end. - All callers of ->writepages() sets range_start/end or range_cyclic. - Fix updates of ->writeback_index. It seems already bit strange. If it starts at 0 and ended by check of nr_to_write, this last index may reduce chance to scan end of file. So, this updates ->writeback_index only if range_cyclic is true or whole-file is scanned. Signed-off-by: OGAWA Hirofumi Cc: Nathan Scott Cc: Anton Altaparmakov Cc: Steven French Cc: "Vladimir V. Saveliev" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++--- mm/page-writeback.c | 3 +++ mm/vmscan.c | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index fd57442186c..3342067ca43 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,8 +190,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = mapping->nrpages * 2, - .start = start, - .end = end, + .range_start = start, + .range_end = end, }; if (!mapping_cap_writeback_dirty(mapping)) @@ -204,7 +204,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { - return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 75d7f48b79b..8ccf6f1b147 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping) .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, + .range_cyclic = 1, }; get_dirty_limits(&wbs, &background_thresh, @@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages) .older_than_this = NULL, .nr_to_write = 0, .nonblocking = 1, + .range_cyclic = 1, }; for ( ; ; ) { @@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg) .nr_to_write = 0, .nonblocking = 1, .for_kupdate = 1, + .range_cyclic = 1, }; sync_supers(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 46be8a02280..bc5d4f43036 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -339,6 +339,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping) struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .nonblocking = 1, .for_reclaim = 1, }; -- cgit v1.2.3 From 7352349a19e84b28f10668eca9cbb6bbbfa9d38e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:27 -0700 Subject: [PATCH] page migration cleanup: rename "ignrefs" to "migration" migrate is a better name since it is only used by page migration. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 1963e269314..10806b7af40 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -578,7 +578,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int ignore_refs) + int migration) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -602,7 +602,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if ((vma->vm_flags & VM_LOCKED) || (ptep_clear_flush_young(vma, address, pte) - && !ignore_refs)) { + && !migration)) { ret = SWAP_FAIL; goto out_unmap; } @@ -736,7 +736,7 @@ static void try_to_unmap_cluster(unsigned long cursor, pte_unmap_unlock(pte - 1, ptl); } -static int try_to_unmap_anon(struct page *page, int ignore_refs) +static int try_to_unmap_anon(struct page *page, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -747,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, ignore_refs); + ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -764,7 +764,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) * * This function is only called from try_to_unmap for object-based pages. */ -static int try_to_unmap_file(struct page *page, int ignore_refs) +static int try_to_unmap_file(struct page *page, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -778,7 +778,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, ignore_refs); + ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -863,16 +863,16 @@ out: * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page, int ignore_refs) +int try_to_unmap(struct page *page, int migration) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, ignore_refs); + ret = try_to_unmap_anon(page, migration); else - ret = try_to_unmap_file(page, ignore_refs); + ret = try_to_unmap_file(page, migration); if (!page_mapped(page)) ret = SWAP_SUCCESS; -- cgit v1.2.3 From 1d8b85ccf1ed53a71b092fb5d807edf1ea7dabdd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:28 -0700 Subject: [PATCH] page migration cleanup: group functions Reorder functions in migrate.c. Group all migration functions for struct address_space_operations together. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 142 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 72 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1c25040693d..49e71ddb679 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -119,15 +119,6 @@ int putback_lru_pages(struct list_head *l) return count; } -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -297,6 +288,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) } EXPORT_SYMBOL(migrate_page_copy); +/************************************************************ + * Migration functions + ***********************************************************/ + +/* Always fail migration. Used for mappings that are not movable */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate. @@ -329,6 +331,67 @@ int migrate_page(struct page *newpage, struct page *page) } EXPORT_SYMBOL(migrate_page); +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + /* * migrate_pages * @@ -528,67 +591,6 @@ next: return nr_failed + retry; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); - /* * Migrate the list 'pagelist' of pages to a certain destination. * -- cgit v1.2.3 From e7340f73307abed9283d0a07570d06e228c205dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:29 -0700 Subject: [PATCH] page migration cleanup: remove useless definitions Remove the export for migrate_page_remove_references() and migrate_page_copy() that are unlikely to be used directly by filesystems implementing migration. The export was useful when buffer_migrate_page() lived in fs/buffer.c but it has now been moved to migrate.c in the migration reorg. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 49e71ddb679..be3f141e53a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -169,7 +169,7 @@ retry: * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. */ -int migrate_page_remove_references(struct page *newpage, +static int migrate_page_remove_references(struct page *newpage, struct page *page, int nr_refs) { struct address_space *mapping = page_mapping(page); @@ -246,12 +246,11 @@ int migrate_page_remove_references(struct page *newpage, return 0; } -EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +static void migrate_page_copy(struct page *newpage, struct page *page) { copy_highpage(newpage, page); @@ -286,7 +285,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } -EXPORT_SYMBOL(migrate_page_copy); /************************************************************ * Migration functions -- cgit v1.2.3 From 5b5c7120e2154239837fad5e3c7b7b781092b19c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:29 -0700 Subject: [PATCH] page migration cleanup: drop nr_refs in remove_references() Drop nr_refs parameter from migrate_page_remove_references() The nr_refs parameter is not really useful since the number of remaining references is always 1 for anonymous pages without a mapping 2 for pages with a mapping 3 for pages with a mapping and PagePrivate set. Remove the early check for the number of references since we are checking page_mapcount() earlier. Ultimately only the refcount matters after the tree_lock has been obtained. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index be3f141e53a..2803a6698dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,19 +168,19 @@ retry: /* * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate set. */ static int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) + struct page *page) { struct address_space *mapping = page_mapping(page); struct page **radix_pointer; - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + if (!mapping) return -EAGAIN; /* @@ -218,7 +218,8 @@ static int migrate_page_remove_references(struct page *newpage, &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || page_count(page) != nr_refs || + if (!page_mapping(page) || + page_count(page) != 2 + !!PagePrivate(page) || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -309,7 +310,7 @@ int migrate_page(struct page *newpage, struct page *page) BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page, 2); + rc = migrate_page_remove_references(newpage, page); if (rc) return rc; @@ -348,7 +349,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page, 3); + rc = migrate_page_remove_references(newpage, page); if (rc) return rc; -- cgit v1.2.3 From c3fcf8a5daacf350f0632e1379414c01f34eeea3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:32 -0700 Subject: [PATCH] page migration cleanup: extract try_to_unmap from migration functions Extract try_to_unmap and rename remove_references -> move_mapping try_to_unmap() may significantly change the page state by for example setting the dirty bit. It is therefore best to unmap in migrate_pages() before calling any migration functions. migrate_page_remove_references() will then only move the new page in place of the old page in the mapping. Rename the function to migrate_page_move_mapping(). This allows us to get rid of the special unmapping for the fallback path. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 76 +++++++++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2803a6698dd..8095c607a49 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -166,15 +166,14 @@ retry: } /* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. + * Replace the page in the mapping. * * The number of remaining references must be: * 1 for anonymous pages without a mapping * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate set. */ -static int migrate_page_remove_references(struct page *newpage, +static int migrate_page_move_mapping(struct page *newpage, struct page *page) { struct address_space *mapping = page_mapping(page); @@ -183,35 +182,6 @@ static int migrate_page_remove_references(struct page *newpage, if (!mapping) return -EAGAIN; - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( @@ -310,7 +280,7 @@ int migrate_page(struct page *newpage, struct page *page) BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page); + rc = migrate_page_move_mapping(newpage, page); if (rc) return rc; @@ -349,7 +319,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page) head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page); + rc = migrate_page_move_mapping(newpage, page); if (rc) return rc; @@ -481,6 +451,33 @@ redo: newpage = lru_to_page(to); lock_page(newpage); + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + rc = -EPERM; + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + goto unlock_both; + + rc = -EAGAIN; + if (page_mapped(page)) + goto unlock_both; /* * Pages are properly locked and writeback is complete. * Try to migrate the page. @@ -501,17 +498,6 @@ redo: goto unlock_both; } - /* Make sure the dirty bit is up to date */ - if (try_to_unmap(page, 1) == SWAP_FAIL) { - rc = -EPERM; - goto unlock_both; - } - - if (page_mapcount(page)) { - rc = -EAGAIN; - goto unlock_both; - } - /* * Default handling if a filesystem does not provide * a migration function. We can only migrate clean -- cgit v1.2.3 From 2d1db3b1170db4e8bf0531dd636742269c2cf579 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:33 -0700 Subject: [PATCH] page migration cleanup: pass "mapping" to migration functions Change handling of address spaces. Pass a pointer to the address space in which the page is migrated to all migration function. This avoids repeatedly having to retrieve the address space pointer from the page and checking it for validity. The old page mapping will change once migration has gone to a certain step, so it is less confusing to have the pointer always available. Move the setting of the mapping and index for the new page into migrate_pages(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 70 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 8095c607a49..f65e69d9452 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -173,15 +173,11 @@ retry: * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate set. */ -static int migrate_page_move_mapping(struct page *newpage, - struct page *page) +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page_mapping(page); struct page **radix_pointer; - if (!mapping) - return -EAGAIN; - write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( @@ -197,15 +193,8 @@ static int migrate_page_move_mapping(struct page *newpage, /* * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. */ get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); @@ -262,7 +251,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) ***********************************************************/ /* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct page *newpage, struct page *page) +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { return -EIO; } @@ -274,13 +264,14 @@ EXPORT_SYMBOL(fail_migrate_page); * * Pages are locked upon entry and exit. */ -int migrate_page(struct page *newpage, struct page *page) +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -305,21 +296,18 @@ EXPORT_SYMBOL(migrate_page); * if the underlying filesystem guarantees that no other references to "page" * exist. */ -int buffer_migrate_page(struct page *newpage, struct page *page) +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; int rc; - if (!mapping) - return -EAGAIN; - if (!page_has_buffers(page)) - return migrate_page(newpage, page); + return migrate_page(mapping, newpage, page); head = page_buffers(page); - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -448,9 +436,6 @@ redo: goto next; } - newpage = lru_to_page(to); - lock_page(newpage); - /* * Establish swap ptes for anonymous pages or destroy pte * maps for files. @@ -473,11 +458,18 @@ redo: rc = -EPERM; if (try_to_unmap(page, 1) == SWAP_FAIL) /* A vma has VM_LOCKED set -> permanent failure */ - goto unlock_both; + goto unlock_page; rc = -EAGAIN; if (page_mapped(page)) - goto unlock_both; + goto unlock_page; + + newpage = lru_to_page(to); + lock_page(newpage); + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + /* * Pages are properly locked and writeback is complete. * Try to migrate the page. @@ -494,7 +486,8 @@ redo: * own migration function. This is the most common * path for page migration. */ - rc = mapping->a_ops->migratepage(newpage, page); + rc = mapping->a_ops->migratepage(mapping, + newpage, page); goto unlock_both; } @@ -524,7 +517,7 @@ redo: */ if (!page_has_buffers(page) || try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); + rc = migrate_page(mapping, newpage, page); goto unlock_both; } @@ -553,12 +546,17 @@ unlock_page: unlock_page(page); next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; + if (rc) { + if (newpage) + newpage->mapping = NULL; + + if (rc == -EAGAIN) + retry++; + else { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } } else { if (newpage) { /* Successful migration. Return page to LRU */ -- cgit v1.2.3 From 8351a6e4785218a2b03c142be92926baff95ba5c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:33 -0700 Subject: [PATCH] page migration cleanup: move fallback handling into special function Move the fallback code into a new fallback function and make the function behave like any other migration function. This requires retaking the lock if pageout() drops it. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 90 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 39 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index f65e69d9452..5a340f4ca21 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -349,6 +349,42 @@ int buffer_migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(buffer_migrate_page); +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + return -EAGAIN; + + case PAGE_SUCCESS: + /* Relock since we lost the lock */ + lock_page(page); + /* Must retry since page state may have changed */ + return -EAGAIN; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_buffers(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + /* * migrate_pages * @@ -478,7 +514,7 @@ redo: if (!mapping) goto unlock_both; - if (mapping->a_ops->migratepage) { + if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems * should provide a migration function. Anonymous @@ -488,56 +524,8 @@ redo: */ rc = mapping->a_ops->migratepage(mapping, newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(mapping, newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } + else + rc = fallback_migrate_page(mapping, newpage, page); unlock_both: unlock_page(newpage); -- cgit v1.2.3 From 0697212a411c1dae03c27845f2de2f3adb32c331 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:35 -0700 Subject: [PATCH] Swapless page migration: add R/W migration entries Implement read/write migration ptes We take the upper two swapfiles for the two types of migration ptes and define a series of macros in swapops.h. The VM is modified to handle the migration entries. migration entries can only be encountered when the page they are pointing to is locked. This limits the number of places one has to fix. We also check in copy_pte_range and in mprotect_pte_range() for migration ptes. We check for migration ptes in do_swap_cache and call a function that will then wait on the page lock. This allows us to effectively stop all accesses to apge. Migration entries are created by try_to_unmap if called for migration and removed by local functions in migrate.c From: Hugh Dickins Several times while testing swapless page migration (I've no NUMA, just hacking it up to migrate recklessly while running load), I've hit the BUG_ON(!PageLocked(p)) in migration_entry_to_page. This comes from an orphaned migration entry, unrelated to the current correctly locked migration, but hit by remove_anon_migration_ptes as it checks an address in each vma of the anon_vma list. Such an orphan may be left behind if an earlier migration raced with fork: copy_one_pte can duplicate a migration entry from parent to child, after remove_anon_migration_ptes has checked the child vma, but before it has removed it from the parent vma. (If the process were later to fault on this orphaned entry, it would hit the same BUG from migration_entry_wait.) This could be fixed by locking anon_vma in copy_one_pte, but we'd rather not. There's no such problem with file pages, because vma_prio_tree_add adds child vma after parent vma, and the page table locking at each end is enough to serialize. Follow that example with anon_vma: add new vmas to the tail instead of the head. (There's no corresponding problem when inserting migration entries, because a missed pte will leave the page count and mapcount high, which is allowed for. And there's no corresponding problem when migrating via swap, because a leftover swap entry will be correctly faulted. But the swapless method has no refcounting of its entries.) From: Ingo Molnar pte_unmap_unlock() takes the pte pointer as an argument. From: Hugh Dickins Several times while testing swapless page migration, gcc has tried to exec a pointer instead of a string: smells like COW mappings are not being properly write-protected on fork. The protection in copy_one_pte looks very convincing, until at last you realize that the second arg to make_migration_entry is a boolean "write", and SWP_MIGRATION_READ is 30. Anyway, it's better done like in change_pte_range, using is_write_migration_entry and make_migration_entry_read. From: Hugh Dickins Remove unnecessary obfuscation from sys_swapon's range check on swap type, which blew up causing memory corruption once swapless migration made MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT. Signed-off-by: Hugh Dickins Acked-by: Martin Schwidefsky Signed-off-by: Hugh Dickins Signed-off-by: Christoph Lameter Signed-off-by: Ingo Molnar From: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 18 ++++++++- mm/migrate.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- mm/mprotect.c | 23 +++++++++-- mm/rmap.c | 38 ++++++++++------- mm/swapfile.c | 20 ++++----- 5 files changed, 195 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7e3683fd4f3..11673c5d2c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 5a340f4ca21..0a011e421bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include "internal.h" @@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l) return count; } +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + inc_mm_counter(mm, anon_rss); + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + page_add_anon_rmap(new, vma, addr); +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + * + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(new, vma), + old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + /* * swapout a single page * page is locked upon entry, unlocked on exit diff --git a/mm/mprotect.c b/mm/mprotect.c index 5faf01ad3ef..14f93e62270 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -19,7 +19,8 @@ #include #include #include - +#include +#include #include #include #include @@ -28,12 +29,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { - pte_t *pte; + pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { - if (pte_present(*pte)) { + oldpte = *pte; + if (pte_present(oldpte)) { pte_t ptent; /* Avoid an SMP race with hardware updated dirty/clean @@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); +#ifdef CONFIG_MIGRATION + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } +#endif } + } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); } diff --git a/mm/rmap.c b/mm/rmap.c index 10806b7af40..3b8ce86daa3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); @@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); } } @@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) if (anon_vma) { spin_lock(&anon_vma->lock); - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); spin_unlock(&anon_vma->lock); } @@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - BUG_ON(!PageSwapCache(page)); - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); diff --git a/mm/swapfile.c b/mm/swapfile.c index 47a6812f5f8..e3b1362372c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry) struct swap_info_struct * p; struct page *page = NULL; + if (is_migration_entry(entry)) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { @@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!(p->flags & SWP_USED)) break; error = -EPERM; - /* - * Test if adding another swap device is possible. There are - * two limiting factors: 1) the number of bits for the swap - * type swp_entry_t definition and 2) the number of bits for - * the swap type in the swap ptes as defined by the different - * architectures. To honor both limitations a swap entry - * with swap offset 0 and swap type ~0UL is created, encoded - * to a swap pte, decoded to a swp_entry_t again and finally - * the swap type part is extracted. This will mask all bits - * from the initial ~0UL that can't be encoded in either the - * swp_entry_t or the architecture definition of a swap pte. - */ - if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { + if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); goto out; } @@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; + if (is_migration_entry(entry)) + return 1; + type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; -- cgit v1.2.3 From d75a0fcda2cfc71b50e16dc89e0c32c57d427e85 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:36 -0700 Subject: [PATCH] Swapless page migration: rip out swap based logic Rip the page migration logic out. Remove all code that has to do with swapping during page migration. This also guts the ability to migrate pages to swap. No one used that so lets let it go for good. Page migration should be a bit broken after this patch. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 75 +++-------------------------------------------------------- mm/rmap.c | 38 ------------------------------ mm/swapfile.c | 9 ------- 3 files changed, 3 insertions(+), 119 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 0a011e421bb..81721a061d5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -70,10 +70,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -245,52 +241,6 @@ out: pte_unmap_unlock(ptep, ptl); } -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} - /* * Replace the page in the mapping. * @@ -517,8 +467,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Two lists are passed to this function. The first list * contains the pages isolated from the LRU to be migrated. * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. + * can be moved to. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty @@ -574,29 +523,11 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) { + if (pass > 0) wait_on_page_writeback(page); - } else { + else if (PageWriteback(page)) goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } /* * Establish swap ptes for anonymous pages or destroy pte diff --git a/mm/rmap.c b/mm/rmap.c index 3b8ce86daa3..a53a10b93ec 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -205,44 +205,6 @@ out: return anon_vma; } -#ifdef CONFIG_MIGRATION -/* - * Remove an anonymous page from swap replacing the swap pte's - * through real pte's pointing to valid pages and then releasing - * the page from the swap cache. - * - * Must hold page lock on page and mmap_sem of one vma that contains - * the page. - */ -void remove_from_swap(struct page *page) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - if (!PageSwapCache(page)) - return; - - mapping = (unsigned long)page->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_vma_swap(vma, page); - - spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); -} -EXPORT_SYMBOL(remove_from_swap); -#endif - /* * At what user virtual address is page expected in vma? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index e3b1362372c..fbceed67a07 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -618,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm, return 0; } -#ifdef CONFIG_MIGRATION -int remove_vma_swap(struct vm_area_struct *vma, struct page *page) -{ - swp_entry_t entry = { .val = page_private(page) }; - - return unuse_vma(vma, entry, page); -} -#endif - /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. -- cgit v1.2.3 From 6c5240ae7f48c83fcaa8e24fa63e7eb09aba5651 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:37 -0700 Subject: [PATCH] Swapless page migration: modify core logic Use the migration entries for page migration This modifies the migration code to use the new migration entries. It now becomes possible to migrate anonymous pages without having to add a swap entry. We add a couple of new functions to replace migration entries with the proper ptes. We cannot take the tree_lock for migrating anonymous pages anymore. However, we know that we hold the only remaining reference to the page when the page count reaches 1. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 4 ++-- mm/migrate.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 23 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 332f5c29b53..66e65ab3942 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS # config MIGRATION bool "Page migration" - def_bool y if NUMA - depends on SWAP && NUMA + def_bool y + depends on NUMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for diff --git a/mm/migrate.c b/mm/migrate.c index 81721a061d5..8f91463eab4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -254,14 +254,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, { struct page **radix_pointer; + if (!mapping) { + /* Anonymous page */ + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || - page_count(page) != 2 + !!PagePrivate(page) || + if (page_count(page) != 2 + !!PagePrivate(page) || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -271,10 +277,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page. */ get_page(newpage); +#ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } +#endif *radix_pointer = newpage; __put_page(page); @@ -308,7 +316,9 @@ static void migrate_page_copy(struct page *newpage, struct page *page) set_page_dirty(newpage); } +#ifdef CONFIG_SWAP ClearPageSwapCache(page); +#endif ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -353,16 +363,6 @@ int migrate_page(struct address_space *mapping, return rc; migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); return 0; } EXPORT_SYMBOL(migrate_page); @@ -530,23 +530,7 @@ redo: goto unlock_page; /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. + * Establish migration ptes or remove ptes */ rc = -EPERM; if (try_to_unmap(page, 1) == SWAP_FAIL) @@ -569,9 +553,9 @@ redo: */ mapping = page_mapping(page); if (!mapping) - goto unlock_both; + rc = migrate_page(mapping, newpage, page); - if (mapping->a_ops->migratepage) + else if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems * should provide a migration function. Anonymous @@ -584,10 +568,15 @@ redo: else rc = fallback_migrate_page(mapping, newpage, page); -unlock_both: + if (!rc) + remove_migration_ptes(page, newpage); + unlock_page(newpage); unlock_page: + if (rc) + remove_migration_ptes(page, page); + unlock_page(page); next: -- cgit v1.2.3 From 442c9137de8d769053e81d325709dca72f0b5e44 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:38 -0700 Subject: [PATCH] More page migration: do not inc/dec rss counters If we install a migration entry then the rss not really decreases since the page is just moved somewhere else. We can save ourselves the work of decrementing and later incrementing which will just eventually cause cacheline bouncing. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 - mm/rmap.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 8f91463eab4..96b9546e69e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -164,7 +164,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) goto out; - inc_mm_counter(mm, anon_rss); get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) diff --git a/mm/rmap.c b/mm/rmap.c index a53a10b93ec..05d6d73a692 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -595,6 +595,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, anon_rss); } else { /* * Store the pfn of the page in a special migration @@ -606,7 +607,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - dec_mm_counter(mm, anon_rss); } else dec_mm_counter(mm, file_rss); -- cgit v1.2.3 From 04e62a29bf157ce1edd168f2b71b533c80d13628 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:38 -0700 Subject: [PATCH] More page migration: use migration entries for file pages This implements the use of migration entries to preserve ptes of file backed pages during migration. Processes can therefore be migrated back and forth without loosing their connection to pagecache pages. Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration. And another writepage() ugliness shows up. writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepages() in order to avoid having migration entries point to unlocked pages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++------------- mm/rmap.c | 11 ++++++ mm/vmscan.c | 14 ++++++- 3 files changed, 124 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 96b9546e69e..b5000d46389 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte) /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, +static void remove_migration_pte(struct vm_area_struct *vma, struct page *old, struct page *new) { struct mm_struct *mm = vma->vm_mm; @@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; + unsigned long addr = page_address_in_vma(new, vma); + + if (addr == -EFAULT) + return; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); - page_add_anon_rmap(new, vma, addr); + + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); + lazy_mmu_prot_update(pte); + out: pte_unmap_unlock(ptep, ptl); } /* - * Get rid of all migration entries and replace them by - * references to the indicated page. - * + * Note that remove_file_migration_ptes will only work on regular mappings, + * Nonlinear mappings do not use migration entries. + */ +static void remove_file_migration_ptes(struct page *old, struct page *new) +{ + struct vm_area_struct *vma; + struct address_space *mapping = page_mapping(new); + struct prio_tree_iter iter; + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!mapping) + return; + + spin_lock(&mapping->i_mmap_lock); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) + remove_migration_pte(vma, old, new); + + spin_unlock(&mapping->i_mmap_lock); +} + +/* * Must hold mmap_sem lock on at least one of the vmas containing * the page so that the anon_vma cannot vanish. */ -static void remove_migration_ptes(struct page *old, struct page *new) +static void remove_anon_migration_ptes(struct page *old, struct page *new) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -199,12 +232,23 @@ static void remove_migration_ptes(struct page *old, struct page *new) spin_lock(&anon_vma->lock); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, page_address_in_vma(new, vma), - old, new); + remove_migration_pte(vma, old, new); spin_unlock(&anon_vma->lock); } +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + if (PageAnon(new)) + remove_anon_migration_ptes(old, new); + else + remove_file_migration_ptes(old, new); +} + /* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. @@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(buffer_migrate_page); -static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; + /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - return -EAGAIN; + remove_migration_ptes(page, page); - case PAGE_SUCCESS: - /* Relock since we lost the lock */ - lock_page(page); - /* Must retry since page state may have changed */ - return -EAGAIN; + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return -EIO; - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); /* * Buffers may be managed in a filesystem specific way. diff --git a/mm/rmap.c b/mm/rmap.c index 05d6d73a692..882a85826bb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); +#ifdef CONFIG_MIGRATION } else { /* * Store the pfn of the page in a special migration @@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); +#endif } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); } else +#ifdef CONFIG_MIGRATION + if (migration) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else +#endif dec_mm_counter(mm, file_rss); + page_remove_rmap(page); page_cache_release(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index bc5d4f43036..71a02e29503 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + /* * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write -- cgit v1.2.3 From ddc2e812d592457747c4367fb73edcaa8e1e49ff Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:03:40 -0700 Subject: [PATCH] slab: verify pointers before free Passing an invalid pointer to kfree() and kmem_cache_free() is likely to cause bad memory corruption or even take down the whole system because the bad pointer is likely reused immediately due to the per-CPU caches. Until now, we don't do any verification for this if CONFIG_DEBUG_SLAB is disabled. As suggested by Linus, add PageSlab check to page_to_cache() and page_to_slab() to verify pointers passed to kfree(). Also, move the stronger check from cache_free_debugcheck() to kmem_cache_free() to ensure the passed pointer actually belongs to the cache we're about to free the object. For page_to_cache() and page_to_slab(), the assertions should have virtually no extra cost (two instructions, no data cache pressure) and for kmem_cache_free() the overhead should be minimal. Signed-off-by: Pekka Enberg Cc: Manfred Spraul Cc: Christoph Lameter Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2046da2aa46..e3260db04b9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -592,6 +592,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page) { if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); + BUG_ON(!PageSlab(page)); return (struct kmem_cache *)page->lru.next; } @@ -604,6 +605,7 @@ static inline struct slab *page_get_slab(struct page *page) { if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); + BUG_ON(!PageSlab(page)); return (struct slab *)page->lru.prev; } @@ -2669,15 +2671,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_page(objp); - if (page_get_cache(page) != cachep) { - printk(KERN_ERR "mismatch in kmem_cache_free: expected " - "cache %p, got %p\n", - page_get_cache(page), cachep); - printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); - printk(KERN_ERR "%p is %s.\n", page_get_cache(page), - page_get_cache(page)->name); - WARN_ON(1); - } slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { @@ -3393,6 +3386,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + BUG_ON(virt_to_cache(objp) != cachep); + local_irq_save(flags); __cache_free(cachep, objp); local_irq_restore(flags); -- cgit v1.2.3 From 30c253e6da655d73eb8bfe2adca9b8f4d82fb81e Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 Jun 2006 02:03:41 -0700 Subject: [PATCH] sparsemem: record nid during memory present Record the node id as we mark sections for instantiation. Use this nid during instantiation to direct allocations. Signed-off-by: Andy Whitcroft Cc: Mike Kravetz Cc: Dave Hansen Cc: Mel Gorman Cc: Bob Picco Cc: Jack Steiner Cc: Yasunori Goto Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 100040c0dfb..e0a3fe48aa3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } +/* + * During early boot, before section_mem_map is used for an actual + * mem_map, we use section_mem_map to store the section's NUMA + * node. This keeps us from having to use another data structure. The + * node information is cleared just before we store the real mem_map. + */ +static inline unsigned long sparse_encode_early_nid(int nid) +{ + return (nid << SECTION_NID_SHIFT); +} + +static inline int sparse_early_nid(struct mem_section *section) +{ + return (section->section_mem_map >> SECTION_NID_SHIFT); +} + /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) ms = __nr_to_section(section); if (!ms->section_mem_map) - ms->section_mem_map = SECTION_MARKED_PRESENT; + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_MARKED_PRESENT; } } @@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms, if (!valid_section(ms)) return -EINVAL; + ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); return 1; @@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms, static struct page *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; - int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) -- cgit v1.2.3 From 9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:03:43 -0700 Subject: [PATCH] add page_mkwrite() vm_operations method Add a new VMA operation to notify a filesystem or other driver about the MMU generating a fault because userspace attempted to write to a page mapped through a read-only PTE. This facility permits the filesystem or driver to: (*) Implement storage allocation/reservation on attempted write, and so to deal with problems such as ENOSPC more gracefully (perhaps by generating SIGBUS). (*) Delay making the page writable until the contents have been written to a backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS. It permits the filesystem to have some guarantee about the state of the cache. (*) Account and limit number of dirty pages. This is one piece of the puzzle needed to make shared writable mapping work safely in FUSE. Needed by cachefs (Or is it cachefiles? Or fscache? ). At least four other groups have stated an interest in it or a desire to use the functionality it provides: FUSE, OCFS2, NTFS and JFFS2. Also, things like EXT3 really ought to use it to deal with the case of shared-writable mmap encountering ENOSPC before we permit the page to be dirtied. From: Peter Zijlstra get_user_pages(.write=1, .force=1) can generate COW hits on read-only shared mappings, this patch traps those as mkpage_write candidates and fails to handle them the old way. Signed-off-by: David Howells Cc: Miklos Szeredi Cc: Joel Becker Cc: Mark Fasheh Cc: Anton Altaparmakov Cc: David Woodhouse Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 100 ++++++++++++++++++++++++++++++++++++++++++++-------------- mm/mmap.c | 12 +++++-- mm/mprotect.c | 11 +++++-- 3 files changed, 95 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 11673c5d2c2..247b5c312b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int ret = VM_FAULT_MINOR; + int reuse, ret = VM_FAULT_MINOR; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - ret |= VM_FAULT_WRITE; - goto unlock; + if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == + (VM_SHARED|VM_WRITE))) { + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + /* + * Notify the address space that the page is about to + * become writable so that it can prohibit this or wait + * for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can + * sleep if it needs to. + */ + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + goto unwritable_page; + + page_cache_release(old_page); + + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) + goto unlock; } + + reuse = 1; + } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } else { + reuse = 0; + } + + if (reuse) { + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_set_access_flags(vma, address, page_table, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + ret |= VM_FAULT_WRITE; + goto unlock; } /* @@ -1535,6 +1570,10 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; + +unwritable_page: + page_cache_release(old_page); + return VM_FAULT_SIGBUS; } /* @@ -2083,18 +2122,31 @@ retry: /* * Should we do an early C-O-W break? */ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page *page; + if (write_access) { + if (!(vma->vm_flags & VM_SHARED)) { + struct page *page; - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - new_page = page; - anon = 1; + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!page) + goto oom; + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + new_page = page; + anon = 1; + + } else { + /* if the page will be shareable, see if the backing + * address space wants to know that the page is about + * to become writable */ + if (vma->vm_ops->page_mkwrite && + vma->vm_ops->page_mkwrite(vma, new_page) < 0 + ) { + page_cache_release(new_page); + return VM_FAULT_SIGBUS; + } + } } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); diff --git a/mm/mmap.c b/mm/mmap.c index e6ee12344b1..6446c6134b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1065,7 +1065,8 @@ munmap_back: vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; - vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_page_prot = protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; vma->vm_pgoff = pgoff; if (file) { @@ -1089,6 +1090,12 @@ munmap_back: goto free_vma; } + /* Don't make the VMA automatically writable if it's shared, but the + * backer wishes to know when pages are first written to */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + vma->vm_page_prot = + protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) * that memory reservation must be checked; but that reservation @@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_end = addr + len; vma->vm_pgoff = pgoff; vma->vm_flags = flags; - vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_page_prot = protection_map[flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; vma_link(mm, vma, prev, rb_link, rb_parent); out: mm->total_vm += len >> PAGE_SHIFT; diff --git a/mm/mprotect.c b/mm/mprotect.c index 14f93e62270..638edabaff7 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; + unsigned int mask; pgprot_t newprot; pgoff_t pgoff; int error; @@ -149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } } - newprot = protection_map[newflags & 0xf]; - /* * First try to merge with previous and/or next vma. */ @@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } success: + /* Don't make the VMA automatically writable if it's shared, but the + * backer wishes to know when pages are first written to */ + mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + mask &= ~VM_SHARED; + + newprot = protection_map[newflags & mask]; + /* * vm_flags and vm_page_prot are protected by the mmap_sem * held in write mode. -- cgit v1.2.3 From 70af7c5c6492ef6ad137dbff6c4568c73edbcaf0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Jun 2006 02:03:44 -0700 Subject: [PATCH] swapoff: use atomic_inc_not_zero() on mm_users Now that we have atomic_inc_not_zero, it's more elegant for try_to_unuse to use that on mm_users: doesn't actually matter at present, but safer to be sure that once mm_users has gone to 0, nothing raises it for an instant. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index fbceed67a07..f2824c3c31b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -772,10 +772,8 @@ static int try_to_unuse(unsigned int type) while (*swap_map > 1 && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); - if (atomic_inc_return(&mm->mm_users) == 1) { - atomic_dec(&mm->mm_users); + if (!atomic_inc_not_zero(&mm->mm_users)) continue; - } spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; -- cgit v1.2.3 From 668e0d8f1a02fd75f1c1e8142a6b08455914242c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Jun 2006 02:03:45 -0700 Subject: [PATCH] fix update_mmu_cache in fremap.c There are two calls to update_mmu_cache in fremap.c, both defective. The one in install_page needs to be accompanied by lazy_mmu_prot_update (some other cleanup time, move that into ia64 update_mmu_cache itself); and the one in install_file_pte should be removed since the pte is not present. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 9f381e58bf4..21b7d0cbc98 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_file_rmap(page); pte_val = *pte; update_mmu_cache(vma, addr, pte_val); + lazy_mmu_prot_update(pte_val); err = 0; unlock: pte_unmap_unlock(pte, ptl); @@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; - update_mmu_cache(vma, addr, pte_val); + /* + * We don't need to run update_mmu_cache() here because the "file pte" + * being installed by install_file_pte() is not a real pte - it's a + * non-present entry (like a swap entry), noting what file offset should + * be mapped there when there's a fault (in a non-linear vma where + * that's not obvious). + */ pte_unmap_unlock(pte, ptl); err = 0; out: -- cgit v1.2.3 From e0a42726794f71336ff4b26084d453dd597471ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Jun 2006 02:03:46 -0700 Subject: [PATCH] mm/slab.c: fix early init assumption The SLAB bootstrap code assumes that the first two kmalloc caches created (the INDEX_AC and INDEX_L3 kmalloc caches) wont be off-slab. But due to AC and L3 structure size increase in lockdep, one of them ended up being off-slab, and subsequently crashing with: Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP: [] kmem_cache_alloc+0x26/0x7d The fix is to introduce a bootstrap flag and to use it to prevent off-slab caches being created so early during bootup. (The calculation for off-slab caches is quite complex so i didnt want to complicate things with introducing yet another INDEX_ calculation, the flag approach is simpler and smaller.) Signed-off-by: Ingo Molnar Cc: Manfred Spraul Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e3260db04b9..664c3a10acf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size) return 0; } +static int slab_early_init = 1; + #define INDEX_AC index_of(sizeof(struct arraycache_init)) #define INDEX_L3 index_of(sizeof(struct kmem_list3)) @@ -1376,6 +1378,8 @@ void __init kmem_cache_init(void) NULL, NULL); } + slab_early_init = 0; + while (sizes->cs_size != ULONG_MAX) { /* * For performance, all the general caches are L1 aligned. @@ -2106,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, #endif #endif - /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE >> 3)) + /* + * Determine if the slab management is 'on' or 'off' slab. + * (bootstrapping cannot cope with offslab caches so don't do + * it too early on.) + */ + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). -- cgit v1.2.3 From bd1e22b8e0a90f9a91e4c27db14ca15773659bf7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:03:47 -0700 Subject: [PATCH] initialise total_memory() earlier Initialise total_memory earlier in boot. Because if for some reason we run page reclaim early in boot, we don't want total_memory to be zero when we use it as a divisor. And rename total_memory to vm_total_pages to avoid naming clashes with architectures. Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- mm/vmscan.c | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5af33186a25..71a0b2a23f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1725,9 +1725,9 @@ void __meminit build_all_zonelists(void) stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); /* cpuset refresh routine should be here */ } - - printk("Built %i zonelists\n", num_online_nodes()); - + vm_total_pages = nr_free_pagecache_pages(); + printk("Built %i zonelists. Total pages: %ld\n", + num_online_nodes(), vm_total_pages); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 71a02e29503..72babac71de 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -110,7 +110,7 @@ struct shrinker { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -static long total_memory; +long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -743,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; + mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; /* * Now decide how much we really want to unmap some pages. The @@ -1482,7 +1482,6 @@ static int __init kswapd_init(void) pgdat->kswapd = find_task_by_pid(pid); read_unlock(&tasklist_lock); } - total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; } -- cgit v1.2.3 From 5a4d43615921575b0c8299a5407ce4836e4138fd Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Jun 2006 02:03:47 -0700 Subject: [PATCH] update vm_total_pages at memory hotadd Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b1ac3db218..841a077d5ae 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -160,6 +160,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (need_zonelists_rebuild) build_all_zonelists(); - + vm_total_pages = nr_free_pagecache_pages(); return 0; } -- cgit v1.2.3 From 800590f523bf3bde9fa6c8e4d6763e4bf6a2c8ec Mon Sep 17 00:00:00 2001 From: Paul Drynoff Date: Fri, 23 Jun 2006 02:03:48 -0700 Subject: [PATCH] slab: kmalloc, kzalloc comments cleanup and fix - Move comments for kmalloc to right place, currently it near __do_kmalloc - Comments for kzalloc - More detailed comments for kmalloc - Appearance of "kmalloc" and "kzalloc" man pages after "make mandocs" [rdunlap@xenotime.net: simplification] Signed-off-by: Paul Drynoff Acked-by: Randy Dunlap Cc: Pekka Enberg Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 664c3a10acf..98ac20bc0de 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3277,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node); #endif /** - * kmalloc - allocate memory + * __do_kmalloc - allocate memory * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * @flags: the type of memory to allocate (see kmalloc). * @caller: function caller for debug tracking of the caller - * - * kmalloc is the normal method of allocating memory - * in the kernel. - * - * The @flags argument may be one of: - * - * %GFP_USER - Allocate memory on behalf of user. May sleep. - * - * %GFP_KERNEL - Allocate normal kernel ram. May sleep. - * - * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. - * - * Additionally, the %GFP_DMA flag may be set to indicate the memory - * must be suitable for DMA. This can mean different things on different - * platforms. For example, on i386, it means that the memory must come - * from the first 16MB. */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) -- cgit v1.2.3 From 485bb99b49a173a22a0bbf4e189465414947ecac Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Jun 2006 02:03:49 -0700 Subject: [PATCH] kernel-doc for mm/filemap.c mm/filemap.c: - add lots of kernel-doc; - fix some typos and kernel-doc errors; - drop some blank lines between function close and EXPORT_SYMBOL(); Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 173 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 120 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3342067ca43..368678c2d53 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -171,15 +171,17 @@ static int sync_page(void *word) } /** - * filemap_fdatawrite_range - start writeback against all of a mapping's - * dirty pages that lie within the byte offsets + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets inclusive. + * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory * cleansing writeback. The difference between + * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */ @@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } -/* +/** + * filemap_flush - mostly a non-blocking flush + * @mapping: target address_space + * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. */ @@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/* +/** + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * * Wait for writeback to complete against pages indexed by start->end * inclusive */ @@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } -/* +/** + * sync_page_range - write and wait on all pages in the passed range + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * * Write and wait upon all the pages in the passed range. This is a "data * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. @@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, } EXPORT_SYMBOL(sync_page_range); -/* +/** + * sync_page_range_nolock + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. @@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range_nolock); /** - * filemap_fdatawait - walk the list of under-writeback pages of the given - * address space and wait for all of them. - * + * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. */ int filemap_fdatawait(struct address_space *mapping) { @@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); -/* +/** + * filemap_write_and_wait_range - write out & wait on a file range + * @mapping: the address_space for the pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that `lend' is inclusive (describes the last byte to be written) so @@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping, return err; } -/* - * This function is used to add newly allocated pagecache pages: +/** + * add_to_page_cache - add newly allocated pagecache pages + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add newly allocated pagecache pages; * the page is new, so we can just run SetPageLocked() against it. * The other page state flags were set by rmqueue(). * @@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, } return error; } - EXPORT_SYMBOL(add_to_page_cache); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, @@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) EXPORT_SYMBOL(wait_on_page_bit); /** - * unlock_page() - unlock a locked page - * + * unlock_page - unlock a locked page * @page: the page * * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). @@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page) } EXPORT_SYMBOL(unlock_page); -/* - * End writeback against a page. +/** + * end_page_writeback - end writeback against a page + * @page: the page */ void end_page_writeback(struct page *page) { @@ -527,10 +560,11 @@ void end_page_writeback(struct page *page) } EXPORT_SYMBOL(end_page_writeback); -/* - * Get a lock on the page, assuming we need to sleep to get it. +/** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock * - * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some * random driver's requestfn sets TASK_RUNNING, we could busywait. However * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. @@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); -/* - * a rather lightweight function, finding and getting a reference to a +/** + * find_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * + * A rather lightweight function, finding and getting a reference to a * hashed page atomically. */ struct page * find_get_page(struct address_space *mapping, unsigned long offset) @@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_get_page); -/* - * Same as above, but trylock it instead of incrementing the count. +/** + * find_trylock_page - find and lock a page + * @mapping: the address_space to search + * @offset: the page index + * + * Same as find_get_page(), but trylock it instead of incrementing the count. */ struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) { @@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_trylock_page); /** * find_lock_page - locate, pin and lock a pagecache page - * * @mapping: the address_space to search * @offset: the page index * @@ -617,12 +656,10 @@ repeat: read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_lock_page); /** * find_or_create_page - locate or add a pagecache page - * * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode @@ -663,7 +700,6 @@ repeat: page_cache_release(cached_page); return page; } - EXPORT_SYMBOL(find_or_create_page); /** @@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return i; } -/* +/** + * find_get_pages_tag - find and return pages that match @tag + * @mapping: the address_space to search + * @index: the starting page index + * @tag: the tag index + * @nr_pages: the maximum number of pages + * @pages: where the resulting pages are placed + * * Like find_get_pages, except we only return pages which are tagged with - * `tag'. We update *index to index the next page for the traversal. + * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) @@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return ret; } -/* +/** + * grab_cache_page_nowait - returns locked page at given index in given cache + * @mapping: target address_space + * @index: the page index + * * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should @@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) } return page; } - EXPORT_SYMBOL(grab_cache_page_nowait); -/* +/** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read + * @_ra: file's readahead state + * @filp: the file to read + * @ppos: current file position + * @desc: read_descriptor + * @actor: read method + * * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level - * stuff. + * mapping->a_ops->readpage() function for the actual low-level stuff. * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. * - * Note the struct file* is only passed for the use of readpage. It may be - * NULL. + * Note the struct file* is only passed for the use of readpage. + * It may be NULL. */ void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, @@ -1004,7 +1057,6 @@ out: if (filp) file_accessed(filp); } - EXPORT_SYMBOL(do_generic_mapping_read); int file_read_actor(read_descriptor_t *desc, struct page *page, @@ -1045,7 +1097,13 @@ success: return size; } -/* +/** + * __generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @ppos: current file position + * * This is the "read()" routine for all filesystems * that can use the page cache directly. */ @@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, out: return retval; } - EXPORT_SYMBOL(__generic_file_aio_read); ssize_t @@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t BUG_ON(iocb->ki_pos != pos); return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); } - EXPORT_SYMBOL(generic_file_aio_read); ssize_t @@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo ret = wait_on_sync_kiocb(&kiocb); return ret; } - EXPORT_SYMBOL(generic_file_read); int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) @@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, return desc.written; return desc.error; } - EXPORT_SYMBOL(generic_file_sendfile); static ssize_t @@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } #ifdef CONFIG_MMU -/* +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +/** + * page_cache_read - adds requested page to the page cache if not already there + * @file: file to read + * @offset: page index + * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; @@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) #define MMAP_LOTSAMISS (100) -/* +/** + * filemap_nopage - read in file data for page fault handling + * @area: the applicable vm_area + * @address: target address to read in + * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * @@ -1462,7 +1525,6 @@ page_not_uptodate: page_cache_release(page); return NULL; } - EXPORT_SYMBOL(filemap_nopage); static struct page * filemap_getpage(struct file *file, unsigned long pgoff, @@ -1716,7 +1778,13 @@ repeat: return page; } -/* +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Read into the page cache. If a page already exists, * and PageUptodate() is not set, try to fill the page. */ @@ -1754,7 +1822,6 @@ retry: out: return page; } - EXPORT_SYMBOL(read_cache_page); /* @@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr, /* * Performs necessary checks before doing a write * - * Can adjust writing position aor amount of bytes to write. + * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ -- cgit v1.2.3 From 8f9de51a4a98ba32f839903b7d009788bc2c295d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 23 Jun 2006 02:03:50 -0700 Subject: [PATCH] printk() should not be called under zone->lock This patch fixes printk() under zone->lock in show_free_areas(). It can be unsafe to call printk() under this lock, since caller can try to allocate/free some memory and selfdeadlock on this lock. I found allocations/freeing mem both in netconsole and serial console. This issue was faced in reallity when meminfo was periodically printed for debug purposes and netconsole was used. Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71a0b2a23f5..423db0db7c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1491,7 +1491,7 @@ void show_free_areas(void) } for_each_zone(zone) { - unsigned long nr, flags, order, total = 0; + unsigned long nr[MAX_ORDER], flags, order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -1502,11 +1502,12 @@ void show_free_areas(void) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + nr[order] = zone->free_area[order].nr_free; + total += nr[order] << order; } spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", nr[order], K(1UL) << order); printk("= %lukB\n", K(total)); } -- cgit v1.2.3 From e24f0b8f76cc3dd96f36f5b6a9f020f6c3fce198 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:51 -0700 Subject: [PATCH] page migration: simplify migrate_pages() Currently migrate_pages() is mess with lots of goto. Extract two functions from migrate_pages() and get rid of the gotos. Plus we can just unconditionally set the locked bit on the new page since we are the only one holding a reference. Locking is to stop others from accessing the page once we establish references to the new page. Remove the list_del from move_to_lru in order to have finer control over list processing. [akpm@osdl.org: add debug check] Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 218 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 115 insertions(+), 103 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index b5000d46389..09038163bfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -84,7 +84,6 @@ int migrate_prep(void) static inline void move_to_lru(struct page *page) { - list_del(&page->lru); if (PageActive(page)) { /* * lru_cache_add_active checks that @@ -110,6 +109,7 @@ int putback_lru_pages(struct list_head *l) int count = 0; list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); move_to_lru(page); count++; } @@ -533,12 +533,109 @@ static int fallback_migrate_page(struct address_space *mapping, return migrate_page(mapping, newpage, page); } +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + */ +static int move_to_new_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping; + int rc; + + /* + * Block others from accessing the page when we get around to + * establishing additional references. We are the only one + * holding a reference to the new page at this point. + */ + if (TestSetPageLocked(newpage)) + BUG(); + + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + + mapping = page_mapping(page); + if (!mapping) + rc = migrate_page(mapping, newpage, page); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + + if (!rc) + remove_migration_ptes(page, newpage); + else + newpage->mapping = NULL; + + unlock_page(newpage); + + return rc; +} + +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(struct page *newpage, struct page *page, int force) +{ + int rc = 0; + + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto ret; + + rc = -EAGAIN; + if (TestSetPageLocked(page)) { + if (!force) + goto ret; + lock_page(page); + } + + if (PageWriteback(page)) { + if (!force) + goto unlock; + wait_on_page_writeback(page); + } + + /* + * Establish migration ptes or remove ptes + */ + if (try_to_unmap(page, 1) != SWAP_FAIL) { + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); + } else + /* A vma has VM_LOCKED set -> permanent failure */ + rc = -EPERM; + + if (rc) + remove_migration_ptes(page, page); +unlock: + unlock_page(page); +ret: + if (rc != -EAGAIN) { + list_del(&newpage->lru); + move_to_lru(newpage); + } + return rc; +} + /* * migrate_pages * * Two lists are passed to this function. The first list * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated + * The second list contains new pages that the isolated pages * can be moved to. * * The function returns after 10 attempts or if no pages @@ -550,7 +647,7 @@ static int fallback_migrate_page(struct address_space *mapping, int migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) { - int retry; + int retry = 1; int nr_failed = 0; int pass = 0; struct page *page; @@ -561,118 +658,33 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags |= PF_SWAPWRITE; -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) - wait_on_page_writeback(page); - else - if (PageWriteback(page)) - goto unlock_page; - - /* - * Establish migration ptes or remove ptes - */ - rc = -EPERM; - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - goto unlock_page; + for(pass = 0; pass < 10 && retry; pass++) { + retry = 0; - rc = -EAGAIN; - if (page_mapped(page)) - goto unlock_page; + list_for_each_entry_safe(page, page2, from, lru) { - newpage = lru_to_page(to); - lock_page(newpage); - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; + if (list_empty(to)) + break; - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - rc = migrate_page(mapping, newpage, page); + cond_resched(); - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - - if (!rc) - remove_migration_ptes(page, newpage); - - unlock_page(newpage); - -unlock_page: - if (rc) - remove_migration_ptes(page, page); + rc = unmap_and_move(lru_to_page(to), page, pass > 2); - unlock_page(page); - -next: - if (rc) { - if (newpage) - newpage->mapping = NULL; - - if (rc == -EAGAIN) + switch(rc) { + case -EAGAIN: retry++; - else { + break; + case 0: + list_move(&page->lru, moved); + break; + default: /* Permanent failure */ list_move(&page->lru, failed); nr_failed++; + break; } - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); } } - if (retry && pass++ < 10) - goto redo; if (!swapwrite) current->flags &= ~PF_SWAPWRITE; -- cgit v1.2.3 From aaa994b300a172afafab47938804836b923e5ef7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:52 -0700 Subject: [PATCH] page migration: handle freeing of pages in migrate_pages() Do not leave pages on the lists passed to migrate_pages(). Seems that we will not need any postprocessing of pages. This will simplify the handling of pages by the callers of migrate_pages(). Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 8 +------- mm/migrate.c | 48 +++++++++++++++++++++++------------------------- 2 files changed, 24 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c..244f3f130e4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -603,11 +603,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (!list_empty(&pagelist)) { + if (!list_empty(&pagelist)) err = migrate_pages_to(&pagelist, NULL, dest); - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - } return err; } @@ -773,9 +770,6 @@ long do_mbind(unsigned long start, unsigned long len, err = -EIO; } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - up_write(&mm->mmap_sem); mpol_free(new); return err; diff --git a/mm/migrate.c b/mm/migrate.c index 09038163bfe..d3a1810a4c9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -624,6 +624,15 @@ unlock: unlock_page(page); ret: if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + move_to_lru(page); + list_del(&newpage->lru); move_to_lru(newpage); } @@ -640,12 +649,12 @@ ret: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. + * or no retryable pages exist anymore. All pages will be + * retruned to the LRU or freed. * - * Return: Number of pages not migrated when "to" ran empty. + * Return: Number of pages not migrated. */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) +int migrate_pages(struct list_head *from, struct list_head *to) { int retry = 1; int nr_failed = 0; @@ -675,11 +684,9 @@ int migrate_pages(struct list_head *from, struct list_head *to, retry++; break; case 0: - list_move(&page->lru, moved); break; default: /* Permanent failure */ - list_move(&page->lru, failed); nr_failed++; break; } @@ -689,6 +696,7 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + putback_lru_pages(from); return nr_failed + retry; } @@ -702,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); int err = 0; unsigned long offset = 0; int nr_pages; + int nr_failed = 0; struct page *page; struct list_head *p; @@ -740,26 +747,17 @@ redo: if (nr_pages > MIGRATE_CHUNK_SIZE) break; } - err = migrate_pages(pagelist, &newlist, &moved, &failed); + err = migrate_pages(pagelist, &newlist); - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); + if (err >= 0) { + nr_failed += err; + if (list_empty(&newlist) && !list_empty(pagelist)) + goto redo; } - list_splice(&failed, pagelist); - if (err < 0) - return err; +out: /* Calculate number of leftover pages */ - nr_pages = 0; list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + nr_failed++; + return nr_failed; } -- cgit v1.2.3 From 95a402c3847cc16f4ba03013cd01404fa0f14c2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:53 -0700 Subject: [PATCH] page migration: use allocator function for migrate_pages() Instead of passing a list of new pages, pass a function to allocate a new page. This allows the correct placement of MPOL_INTERLEAVE pages during page migration. It also further simplifies the callers of migrate pages. migrate_pages() becomes similar to migrate_pages_to() so drop migrate_pages_to(). The batching of new page allocations becomes unnecessary. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 23 +++++++++++- mm/migrate.c | 115 ++++++++++++++++----------------------------------------- 2 files changed, 53 insertions(+), 85 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 244f3f130e4..f432642e9e6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } +static struct page *new_node_page(struct page *page, unsigned long node) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. @@ -604,7 +610,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) - err = migrate_pages_to(&pagelist, NULL, dest); + err = migrate_pages(&pagelist, new_node_page, dest); + return err; } @@ -691,6 +698,12 @@ int do_migrate_pages(struct mm_struct *mm, } +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} #else static void migrate_page_add(struct page *page, struct list_head *pagelist, @@ -703,6 +716,11 @@ int do_migrate_pages(struct mm_struct *mm, { return -ENOSYS; } + +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + return NULL; +} #endif long do_mbind(unsigned long start, unsigned long len, @@ -764,7 +782,8 @@ long do_mbind(unsigned long start, unsigned long len, err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - nr_failed = migrate_pages_to(&pagelist, vma, -1); + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index d3a1810a4c9..251a8d15825 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -28,9 +28,6 @@ #include "internal.h" -/* The maximum number of pages to take off the LRU for migration */ -#define MIGRATE_CHUNK_SIZE 256 - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* @@ -587,18 +584,23 @@ static int move_to_new_page(struct page *newpage, struct page *page) * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(struct page *newpage, struct page *page, int force) +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force) { int rc = 0; + struct page *newpage = get_new_page(page, private); + + if (!newpage) + return -ENOMEM; if (page_count(page) == 1) /* page was freed from under us. So we are done. */ - goto ret; + goto move_newpage; rc = -EAGAIN; if (TestSetPageLocked(page)) { if (!force) - goto ret; + goto move_newpage; lock_page(page); } @@ -622,7 +624,7 @@ static int unmap_and_move(struct page *newpage, struct page *page, int force) remove_migration_ptes(page, page); unlock: unlock_page(page); -ret: + if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -632,29 +634,33 @@ ret: */ list_del(&page->lru); move_to_lru(page); - - list_del(&newpage->lru); - move_to_lru(newpage); } + +move_newpage: + /* + * Move the new page to the LRU. If migration was not successful + * then this will free the page. + */ + move_to_lru(newpage); return rc; } /* * migrate_pages * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the isolated pages - * can be moved to. + * The function takes one list of pages to migrate and a function + * that determines from the page to be migrated and the private data + * the target of the move and allocates the page. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty * or no retryable pages exist anymore. All pages will be * retruned to the LRU or freed. * - * Return: Number of pages not migrated. + * Return: Number of pages not migrated or error code. */ -int migrate_pages(struct list_head *from, struct list_head *to) +int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private) { int retry = 1; int nr_failed = 0; @@ -671,15 +677,14 @@ int migrate_pages(struct list_head *from, struct list_head *to) retry = 0; list_for_each_entry_safe(page, page2, from, lru) { - - if (list_empty(to)) - break; - cond_resched(); - rc = unmap_and_move(lru_to_page(to), page, pass > 2); + rc = unmap_and_move(get_new_page, private, + page, pass > 2); switch(rc) { + case -ENOMEM: + goto out; case -EAGAIN: retry++; break; @@ -692,72 +697,16 @@ int migrate_pages(struct list_head *from, struct list_head *to) } } } - + rc = 0; +out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; putback_lru_pages(from); - return nr_failed + retry; -} -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - int err = 0; - unsigned long offset = 0; - int nr_pages; - int nr_failed = 0; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist); - - if (err >= 0) { - nr_failed += err; - if (list_empty(&newlist) && !list_empty(pagelist)) - goto redo; - } -out: + if (rc) + return rc; - /* Calculate number of leftover pages */ - list_for_each(p, pagelist) - nr_failed++; - return nr_failed; + return nr_failed + retry; } + -- cgit v1.2.3 From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +- mm/migrate.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 266 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e6..05b84acf0bb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d15825..033a12f4c94 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ move_newpage: * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ out: return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif + -- cgit v1.2.3 From 86c3a7645c05a7d06b72653aa4b2bea4e7229d1b Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:04:02 -0700 Subject: [PATCH] SELinux: add security_task_movememory calls to mm code This patch inserts security_task_movememory hook calls into memory management code to enable security modules to mediate this operation between tasks. Since the last posting, the hook has been renamed following feedback from Christoph Lameter. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Cc: Andi Kleen Acked-by: Christoph Lameter Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++++ mm/migrate.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 05b84acf0bb..ec4a1a950df 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -942,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } + err = security_task_movememory(task); + if (err) + goto out; + err = do_migrate_pages(mm, &old, &new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: diff --git a/mm/migrate.c b/mm/migrate.c index 033a12f4c94..1c2a71aa05c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" @@ -905,6 +906,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out2; } + err = security_task_movememory(task); + if (err) + goto out2; + + task_nodes = cpuset_mems_allowed(task); /* Limit nr_pages so that the multiplication may not overflow */ -- cgit v1.2.3 From c22ce143d15eb288543fe9873e1c5ac1c01b69a1 Mon Sep 17 00:00:00 2001 From: Hiro Yoshioka Date: Fri, 23 Jun 2006 02:04:16 -0700 Subject: [PATCH] x86: cache pollution aware __copy_from_user_ll() Use the x86 cache-bypassing copy instructions for copy_from_user(). Some performance data are Total of GLOBAL_POWER_EVENTS (CPU cycle samples) 2.6.12.4.orig 1921587 2.6.12.4.nt 1599424 1599424/1921587=83.23% (16.77% reduction) BSQ_CACHE_REFERENCE (L3 cache miss) 2.6.12.4.orig 57427 2.6.12.4.nt 20858 20858/57427=36.32% (63.7% reduction) L3 cache miss reduction of __copy_from_user_ll samples % 37408 65.1412 vmlinux __copy_from_user_ll 23 0.1103 vmlinux __copy_user_zeroing_intel_nocache 23/37408=0.061% (99.94% reduction) Top 5 of 2.6.12.4.nt Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000 samples % app name symbol name 128392 8.0274 vmlinux __copy_user_zeroing_intel_nocache 64206 4.0143 vmlinux journal_add_journal_head 59746 3.7355 vmlinux do_get_write_access 47674 2.9807 vmlinux journal_put_journal_head 46021 2.8774 vmlinux journal_dirty_metadata pattern9-0-cpu4-0-09011728/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000 samples % app name symbol name 69755 4.2861 vmlinux __copy_user_zeroing_intel_nocache 55685 3.4215 vmlinux journal_add_journal_head 52371 3.2179 vmlinux __find_get_block 45504 2.7960 vmlinux journal_put_journal_head 36005 2.2123 vmlinux journal_stop pattern9-0-cpu4-0-09011744/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000 samples % app name symbol name 1147 5.4994 vmlinux journal_add_journal_head 881 4.2240 vmlinux journal_dirty_data 872 4.1809 vmlinux blk_rq_map_sg 734 3.5192 vmlinux journal_commit_transaction 617 2.9582 vmlinux radix_tree_delete pattern9-0-cpu4-0-09011731/summary.out iozone results are original 2.6.12.4 CPU time = 207.768 sec cache aware CPU time = 184.783 sec (three times run) 184.783/207.768=88.94% (11.06% reduction) original: pattern9-0-cpu4-0-08191720/iozone.out: CPU Utilization: Wall time 45.997 CPU time 64.527 CPU utilization 140.28 % pattern9-0-cpu4-0-08191741/iozone.out: CPU Utilization: Wall time 46.878 CPU time 71.933 CPU utilization 153.45 % pattern9-0-cpu4-0-08191743/iozone.out: CPU Utilization: Wall time 45.152 CPU time 71.308 CPU utilization 157.93 % cache awre: pattern9-0-cpu4-0-09011728/iozone.out: CPU Utilization: Wall time 44.842 CPU time 62.465 CPU utilization 139.30 % pattern9-0-cpu4-0-09011731/iozone.out: CPU Utilization: Wall time 44.718 CPU time 59.273 CPU utilization 132.55 % pattern9-0-cpu4-0-09011744/iozone.out: CPU Utilization: Wall time 44.367 CPU time 63.045 CPU utilization 142.10 % Signed-off-by: Hiro Yoshioka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/filemap.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 368678c2d53..807a463fd5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,6 @@ */ #include /* for generic_osync_inode */ -#include #include static ssize_t @@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; diff --git a/mm/filemap.h b/mm/filemap.h index 13793ba0ce1..5683cde2205 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include size_t __filemap_copy_from_user_iovec(char *vaddr, @@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, int left; kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); kunmap_atomic(kaddr, KM_USER0); if (left != 0) { /* Do it the slow way */ kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); kunmap(page); } return bytes - left; -- cgit v1.2.3 From 090d2b185d8680fc26a2eaf4245d4171dcf4baf1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:05:08 -0700 Subject: [PATCH] read_mapping_page for address space Add read_mapping_page() which is used for callers that pass mapping->a_ops->readpage as the filler for read_cache_page. This removes some duplication from filesystem code. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f2824c3c31b..cc367f7e75d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1477,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; goto bad_swap; } - page = read_cache_page(mapping, 0, - (filler_t *)mapping->a_ops->readpage, swap_file); + page = read_mapping_page(mapping, 0, swap_file); if (IS_ERR(page)) { error = PTR_ERR(page); goto bad_swap; -- cgit v1.2.3 From 3cbc564024d8f174202f023e8a2991782f6a9431 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Fri, 23 Jun 2006 02:05:40 -0700 Subject: [PATCH] percpu_counters: create lib/percpu_counter.c - Move percpu_counter routines from mm/swap.c to lib/percpu_counter.c Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 88895c249bc..03ae2076f92 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb, #endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP -void percpu_counter_mod(struct percpu_counter *fbc, long amount) -{ - long count; - long *pcount; - int cpu = get_cpu(); - - pcount = per_cpu_ptr(fbc->counters, cpu); - count = *pcount + amount; - if (count >= FBC_BATCH || count <= -FBC_BATCH) { - spin_lock(&fbc->lock); - fbc->count += count; - *pcount = 0; - spin_unlock(&fbc->lock); - } else { - *pcount = count; - } - put_cpu(); -} -EXPORT_SYMBOL(percpu_counter_mod); - -/* - * Add up all the per-cpu counts, return the result. This is a more accurate - * but much slower version of percpu_counter_read_positive() - */ -long percpu_counter_sum(struct percpu_counter *fbc) -{ - long ret; - int cpu; - - spin_lock(&fbc->lock); - ret = fbc->count; - for_each_possible_cpu(cpu) { - long *pcount = per_cpu_ptr(fbc->counters, cpu); - ret += *pcount; - } - spin_unlock(&fbc->lock); - return ret < 0 ? 0 : ret; -} -EXPORT_SYMBOL(percpu_counter_sum); -#endif - /* * Perform any setup for the swap system */ -- cgit v1.2.3 From e0f23603fb2607315ce52432cc4225df410828cf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Jun 2006 02:05:48 -0700 Subject: [PATCH] Remove semi-softlockup from invalidate_mapping_pages If invalidate_mapping_pages is called to invalidate a very large mapping (e.g. a very large block device) and if the only active page in that device is near the end (or at least, at a very large index), such as, say, the superblock of an md array, and if that page happens to be locked when invalidate_mapping_pages is called, then pagevec_lookup will return this page and as it is locked, 'next' will be incremented and pagevec_lookup will be called again. and again. and again. while we count from 0 upto a very large number. We should really always set 'next' to 'page->index+1' before going around the loop again, not just if the page isn't locked. Cc: "Steinar H. Gunderson" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 6cb3fff25f6..cf1b015df4a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; + pgoff_t index; + int lock_failed; - if (TestSetPageLocked(page)) { - next++; - continue; - } - if (page->index > next) - next = page->index; + lock_failed = TestSetPageLocked(page); + + /* + * We really shouldn't be looking at the ->index of an + * unlocked page. But we're not allowed to lock these + * pages. So we rely upon nobody altering the ->index + * of this (pinned-by-us) page. + */ + index = page->index; + if (index > next) + next = index; next++; + if (lock_failed) + continue; + if (PageDirty(page) || PageWriteback(page)) goto unlock; if (page_mapped(page)) -- cgit v1.2.3 From 125e18745f16685f69a34fd6130d47598fc4bf54 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 23 Jun 2006 02:06:06 -0700 Subject: [PATCH] More BUG_ON conversion Signed-off-by: Eric Sesterhenn Signed-off-by: Alexey Dobriyan Cc: Bartlomiej Zolnierkiewicz Cc: Alan Cox Cc: James Bottomley Acked-by: "Salyzyn, Mark" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index c4b6d0afd73..df7e50b8f70 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) unsigned long flags; int ret = 0; - if (fn == NULL) - BUG(); /* Hard to diagnose if it's deferred */ + BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ spin_lock_irqsave(&pdflush_lock, flags); if (list_empty(&pdflush_list)) { -- cgit v1.2.3 From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jun 2006 08:26:10 +0200 Subject: [PATCH] Kill PF_SYNCWRITE flag A process flag to indicate whether we are doing sync io is incredibly ugly. It also causes performance problems when one does a lot of async io and then proceeds to sync it. Part of the io will go out as async, and the other part as sync. This causes a disconnect between the previously submitted io and the synced io. For io schedulers such as CFQ, this will cause us lost merges and suboptimal behaviour in scheduling. Remove PF_SYNCWRITE completely from the fsync/msync paths, and let the O_DIRECT path just directly indicate that the writes are sync by using WRITE_SYNC instead. Signed-off-by: Jens Axboe --- mm/msync.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index bc6c9537636..d083544df21 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * just ignore them, but return -ENOMEM at the end. */ down_read(¤t->mm->mmap_sem); - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); if (!vma) { error = -ENOMEM; @@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) } } while (vma && !done); out_unlock: - current->flags &= ~PF_SYNCWRITE; up_read(¤t->mm->mmap_sem); out: return error; -- cgit v1.2.3 From 9f1a3cfcffaed2fbb3206179295c79ca8289f5c3 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Sun, 25 Jun 2006 05:46:46 -0700 Subject: [PATCH] AOP_TRUNCATED_PAGE victims in read_pages() belong in the LRU AOP_TRUNCATED_PAGE victims in read_pages() belong in the LRU Nick Piggin rightly pointed out that the introduction of AOP_TRUNCATED_PAGE to read_pages() was wrong to leave A_T_P victim pages in the page cache but not put them in the LRU. Failing to do so hid them from the VM. A_T_P just means that the aop method unlocked the page rather than performing IO. It would be very rare that the page was truncated between the unlock and testing A_T_P. So we leave the pages in the LRU for likely reuse soon rather than backing them back out of the page cache. We do this by matching the behaviour before the A_T_P introduction which added pages to the LRU regardless of what ->readpage() did. This doesn't include the unrelated cleanup in Nick's initial fix which changed read_pages() to return void to match its only caller's behaviour of ignoring errors. Signed-off-by: Nick Piggin Signed-off-by: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 0f142a40984..4ee52cadab9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -182,14 +182,11 @@ static int read_pages(struct address_space *mapping, struct file *filp, list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { - ret = mapping->a_ops->readpage(filp, page); - if (ret != AOP_TRUNCATED_PAGE) { - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - continue; - } /* else fall through to release */ - } - page_cache_release(page); + mapping->a_ops->readpage(filp, page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else + page_cache_release(page); } pagevec_lru_add(&lru_pvec); ret = 0; -- cgit v1.2.3 From 7b2259b3e53f128c10a9fded0965e69d4a949847 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 25 Jun 2006 05:46:48 -0700 Subject: [PATCH] page migration: Support a vma migration function Hooks for calling vma specific migration functions With this patch a vma may define a vma->vm_ops->migrate function. That function may perform page migration on its own (some vmas may not contain page structs and therefore cannot be handled by regular page migration. Pages in a vma may require special preparatory treatment before migration is possible etc) . Only mmap_sem is held when the migration function is called. The migrate() function gets passed two sets of nodemasks describing the source and the target of the migration. The flags parameter either contains MPOL_MF_MOVE which means that only pages used exclusively by the specified mm should be moved or MPOL_MF_MOVE_ALL which means that pages shared with other processes should also be moved. The migration function returns 0 on success or an error condition. An error condition will prevent regular page migration from occurring. On its own this patch cannot be included since there are no users for this functionality. But it seems that the uncached allocator will need this functionality at some point. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++++- mm/migrate.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ec4a1a950df..73e0f23b7f5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -632,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; + /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -691,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) break; } - +out: up_read(&mm->mmap_sem); if (err < 0) return err; diff --git a/mm/migrate.c b/mm/migrate.c index 1c2a71aa05c..0576c053598 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -976,3 +976,23 @@ out2: } #endif +/* + * Call migration functions in the vma_ops that may prepare + * memory in a vm for migration. migration functions may perform + * the migration for vmas that do not have an underlying page struct. + */ +int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, + const nodemask_t *from, unsigned long flags) +{ + struct vm_area_struct *vma; + int err = 0; + + for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops->migrate) { + err = vma->vm_ops->migrate(vma, to, from, flags); + if (err) + break; + } + } + return err; +} -- cgit v1.2.3 From e6a1530d692d6a60cdf15dfbcfea07f5324d7b9f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 25 Jun 2006 05:46:49 -0700 Subject: [PATCH] Allow migration of mlocked pages Hugh clarified the role of VM_LOCKED. So we can now implement page migration for mlocked pages. Allow the migration of mlocked pages. This means that try_to_unmap must unmap mlocked pages in the migration case. Signed-off-by: Christoph Lameter Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 10 ++++------ mm/rmap.c | 9 ++++----- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 0576c053598..3f1e0c2c942 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -616,15 +616,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* * Establish migration ptes or remove ptes */ - if (try_to_unmap(page, 1) != SWAP_FAIL) { - if (!page_mapped(page)) - rc = move_to_new_page(newpage, page); - } else - /* A vma has VM_LOCKED set -> permanent failure */ - rc = -EPERM; + try_to_unmap(page, 1); + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); if (rc) remove_migration_ptes(page, page); + unlock: unlock_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index 882a85826bb..e76909e880c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -562,9 +562,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte) - && !migration)) { + if (!migration && ((vma->vm_flags & VM_LOCKED) || + (ptep_clear_flush_young(vma, address, pte)))) { ret = SWAP_FAIL; goto out_unmap; } @@ -771,7 +770,7 @@ static int try_to_unmap_file(struct page *page, int migration) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !migration) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -805,7 +804,7 @@ static int try_to_unmap_file(struct page *page, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !migration) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && -- cgit v1.2.3 From d616e09ab33aa4d013a93c9b393efd5cebf78521 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 25 Jun 2006 05:47:46 -0700 Subject: [PATCH] pdflush: handle resume wakeups pdflush is carefully designed to ensure that all wakeups have some corresponding work to do - if a woken-up pdflush thread discovers that it hasn't been given any work to do then this is considered an error. That all broke when swsusp came along - because a timer-delivered wakeup to a frozen pdflush thread will just get lost. This causes the pdflush thread to get lost as well: the writeback timer is supposed to be re-armed by pdflush in process context, but pdflush doesn't execute the callout which does this. Fix that up by ignoring the return value from try_to_freeze(): jsut proceed, see if we have any work pending and only go back to sleep if that is not the case. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index df7e50b8f70..b02102feeb4 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work) list_move(&my_work->list, &pdflush_list); my_work->when_i_went_to_sleep = jiffies; spin_unlock_irq(&pdflush_lock); - schedule(); - if (try_to_freeze()) { - spin_lock_irq(&pdflush_lock); - continue; - } - + try_to_freeze(); spin_lock_irq(&pdflush_lock); if (!list_empty(&my_work->list)) { - printk("pdflush: bogus wakeup!\n"); + /* + * Someone woke us up, but without removing our control + * structure from the global list. swsusp will do this + * in try_to_freeze()->refrigerator(). Handle it. + */ my_work->fn = NULL; continue; } if (my_work->fn == NULL) { - printk("pdflush: NULL work function\n"); + printk("pdflush: bogus wakeup\n"); continue; } spin_unlock_irq(&pdflush_lock); -- cgit v1.2.3 From 43b0bc00fdbf2f1503a57f0c2c1338438c5d2805 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sun, 25 Jun 2006 05:47:55 -0700 Subject: [PATCH] cpuset: remove extra cpuset_zone_allowed check in __alloc_pages This is redundant with check in wakeup_kswapd. Signed-off-by: Chris Wright Acked-by: Paul Jackson Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 423db0db7c0..6c1174fcf52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -957,8 +957,7 @@ restart: goto got_pg; do { - if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) - wakeup_kswapd(*z, order); + wakeup_kswapd(*z, order); } while (*(++z)); /* -- cgit v1.2.3 From 01408c4939479ec46c15aa7ef6e2406be50eeeca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 25 Jun 2006 05:47:58 -0700 Subject: [PATCH] Prepare for __copy_from_user_inatomic to not zero missed bytes The problem is that when we write to a file, the copy from userspace to pagecache is first done with preemption disabled, so if the source address is not immediately available the copy fails *and* *zeros* *the* *destination*. This is a problem because a concurrent read (which admittedly is an odd thing to do) might see zeros rather that was there before the write, or what was there after, or some mixture of the two (any of these being a reasonable thing to see). If the copy did fail, it will immediately be retried with preemption re-enabled so any transient problem with accessing the source won't cause an error. The first copying does not need to zero any uncopied bytes, and doing so causes the problem. It uses copy_from_user_atomic rather than copy_from_user so the simple expedient is to change copy_from_user_atomic to *not* zero out bytes on failure. The first of these two patches prepares for the change by fixing two places which assume copy_from_user_atomic does zero the tail. The two usages are very similar pieces of code which copy from a userspace iovec into one or more page-cache pages. These are changed to remove the assumption. The second patch changes __copy_from_user_inatomic* to not zero the tail. Once these are accepted, I will look at similar patches of other architectures where this is important (ppc, mips and sparc being the ones I can find). This patch: There is a problem with __copy_from_user_inatomic zeroing the tail of the buffer in the case of an error. As it is called in atomic context, the error may be transient, so it results in zeros being written where maybe they shouldn't be. In the usage in filemap, this opens a window for a well timed read to see data (zeros) which is not consistent with any ordering of reads and writes. Most cases where __copy_from_user_inatomic is called, a failure results in __copy_from_user being called immediately. As long as the latter zeros the tail, the former doesn't need to. However in *copy_from_user_iovec implementations (in both filemap and ntfs/file), it is assumed that copy_from_user_inatomic will zero the tail. This patch removes that assumption, so that after this patch it will be safe for copy_from_user_inatomic to not zero the tail. This patch also adds some commentary to filemap.h and asm-i386/uaccess.h. After this patch, all architectures that might disable preempt when kmap_atomic is called need to have their __copy_from_user_inatomic* "fixed". This includes - powerpc - i386 - mips - sparc Signed-off-by: Neil Brown Cc: David Howells Cc: Anton Altaparmakov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ralf Baechle Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++------ mm/filemap.h | 26 ++++++++++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 807a463fd5e..1ed4be2a765 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1892,7 +1892,7 @@ int remove_suid(struct dentry *dentry) EXPORT_SYMBOL(remove_suid); size_t -__filemap_copy_from_user_iovec(char *vaddr, +__filemap_copy_from_user_iovec_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1908,12 +1908,8 @@ __filemap_copy_from_user_iovec(char *vaddr, vaddr += copy; iov++; - if (unlikely(left)) { - /* zero the rest of the target like __copy_from_user */ - if (bytes) - memset(vaddr, 0, bytes); + if (unlikely(left)) break; - } } return copied - left; } diff --git a/mm/filemap.h b/mm/filemap.h index 5683cde2205..536979fb4ba 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -16,15 +16,23 @@ #include size_t -__filemap_copy_from_user_iovec(char *vaddr, - const struct iovec *iov, - size_t base, - size_t bytes); +__filemap_copy_from_user_iovec_inatomic(char *vaddr, + const struct iovec *iov, + size_t base, + size_t bytes); /* * Copy as much as we can into the page and return the number of bytes which * were sucessfully copied. If a fault is encountered then clear the page * out to (offset+bytes) and return the number of bytes which were copied. + * + * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache + * to *NOT* zero any tail of the buffer that it failed to copy. If it does, + * and if the following non-atomic copy succeeds, then there is a small window + * where the target page contains neither the data before the write, nor the + * data after the write (it contains zero). A read at this time will see + * data that is inconsistent with any ordering of the read and the write. + * (This has been detected in practice). */ static inline size_t filemap_copy_from_user(struct page *page, unsigned long offset, @@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset, size_t copied; kaddr = kmap_atomic(page, KM_USER0); - copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, - base, bytes); + copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, + base, bytes); kunmap_atomic(kaddr, KM_USER0); if (copied != bytes) { kaddr = kmap(page); - copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, - base, bytes); + copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, + base, bytes); + if (bytes - copied) + memset(kaddr + offset + copied, 0, bytes - copied); kunmap(page); } return copied; -- cgit v1.2.3 From bd40cddae2211950c81c41f25a818189f80fd0b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jun 2006 05:48:08 -0700 Subject: [PATCH] kernel-doc: mm/readhead fixup Put short function description for read_cache_pages() on one line as needed by kernel-doc. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 4ee52cadab9..e39e416860d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -118,8 +118,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra) #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** - * read_cache_pages - populate an address space with some pages, and - * start reads against them. + * read_cache_pages - populate an address space with some pages & start reads against them * @mapping: the address_space * @pages: The address of a list_head which contains the target pages. These * pages have their ->index populated and are otherwise uninitialised. -- cgit v1.2.3 From 76d42bd96984832c4ea8bc8cbd74e496ac31409e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 25 Jun 2006 05:48:43 -0700 Subject: [PATCH] readahead: backoff on I/O error Backoff readahead size exponentially on I/O error. Michael Tokarev described the problem as: [QUOTE] Suppose there's a CD-rom with a scratch/etc, one sector is unreadable. In order to "fix" it, one have to read it and write to another CD-rom, or something.. or just ignore the error (if it's just a skip in a video stream). Let's assume the unreadable block is number U. But current behavior is just insane. An application requests block number N, which is before U. Kernel tries to read-ahead blocks N..U. Cdrom drive tries to read it, re-read it.. for some time. Finally, when all the N..U-1 blocks are read, kernel returns block number N (as requested) to an application, successefully. Now an app requests block number N+1, and kernel tries to read blocks N+1..U+1. Retrying again as in previous step. And so on, up to when an app requests block number U-1. And when, finally, it requests block U, it receives read error. So, kernel currentry tries to re-read the same failing block as many times as the current readahead value (256 (times?) by default). This whole process already killed my cdrom drive (I posted about it to LKML several months ago) - literally, the drive has fried, and does not work anymore. Ofcourse that problem was a bug in firmware (or whatever) of the drive *too*, but.. main problem with that is current readahead logic as described above. [/QUOTE] Which was confirmed by Jens Axboe : [QUOTE] For ide-cd, it tends do only end the first part of the request on a medium error. So you may see a lot of repeats :/ [/QUOTE] With this patch, retries are expected to be reduced from, say, 256, to 5. [akpm@osdl.org: cleanups] Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1ed4be2a765..9c7334bafda 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -828,6 +828,32 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) } EXPORT_SYMBOL(grab_cache_page_nowait); +/* + * CD/DVDs are error prone. When a medium error occurs, the driver may fail + * a _large_ part of the i/o request. Imagine the worst scenario: + * + * ---R__________________________________________B__________ + * ^ reading here ^ bad block(assume 4k) + * + * read(R) => miss => readahead(R...B) => media error => frustrating retries + * => failing the whole request => read(R) => read(R+1) => + * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => + * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => + * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... + * + * It is going insane. Fix it by quickly scaling down the readahead size. + */ +static void shrink_readahead_size_eio(struct file *filp, + struct file_ra_state *ra) +{ + if (!ra->ra_pages) + return; + + ra->ra_pages /= 4; + printk(KERN_WARNING "Reducing readahead size to %luK\n", + ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); +} + /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read @@ -985,6 +1011,7 @@ readpage: } unlock_page(page); error = -EIO; + shrink_readahead_size_eio(filp, &ra); goto readpage_error; } unlock_page(page); @@ -1522,6 +1549,7 @@ page_not_uptodate: * Things didn't work out. Return zero to tell the * mm layer so, possibly freeing the page cache page first. */ + shrink_readahead_size_eio(file, ra); page_cache_release(page); return NULL; } -- cgit v1.2.3 From d6e05edc59ecd79e8badf440c0d295a979bdfa3e Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 26 Jun 2006 18:35:02 +0200 Subject: spelling fixes acquired (aquired) contiguous (contigious) successful (succesful, succesfull) surprise (suprise) whether (weather) some other misspellings Signed-off-by: Andreas Mohr Signed-off-by: Adrian Bunk --- mm/page_alloc.c | 2 +- mm/readahead.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c1174fcf52..9f86191bb63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -266,7 +266,7 @@ static inline void rmv_page_order(struct page *page) * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contigious at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline struct page * __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) diff --git a/mm/readahead.c b/mm/readahead.c index e39e416860d..aa7ec424656 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -390,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' * is set wait till the read completes. Otherwise attempt to read without * blocking. - * Returns 1 meaning 'success' if read is succesfull without switching off - * readhaead mode. Otherwise return failure. + * Returns 1 meaning 'success' if read is successful without switching off + * readahead mode. Otherwise return failure. */ static int blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, -- cgit v1.2.3 From 1bfba4e8ea0e555e3a0296051517d96253660ccc Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 26 Jun 2006 00:24:40 -0700 Subject: [PATCH] core: use list_move() This patch converts the combination of list_del(A) and list_add(A, B) to list_move(A, B). Cc: Greg Kroah-Hartman Cc: Ram Pai Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 03ae2076f92..990868afc1c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -86,8 +86,7 @@ int rotate_reclaimable_page(struct page *page) zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { - list_del(&page->lru); - list_add_tail(&page->lru, &zone->inactive_list); + list_move_tail(&page->lru, &zone->inactive_list); inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) -- cgit v1.2.3 From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:55 -0700 Subject: [PATCH] proc: don't lock task_structs indefinitely Every inode in /proc holds a reference to a struct task_struct. If a directory or file is opened and remains open after the the task exits this pinning continues. With 8K stacks on a 32bit machine the amount pinned per file descriptor is about 10K. Normally I would figure a reasonable per user process limit is about 100 processes. With 80 processes, with a 1000 file descriptors each I can trigger the 00M killer on a 32bit kernel, because I have pinned about 800MB of useless data. This patch replaces the struct task_struct pointer with a pointer to a struct task_ref which has a struct task_struct pointer. The so the pinning of dead tasks does not happen. The code now has to contend with the fact that the task may now exit at any time. Which is a little but not muh more complicated. With this change it takes about 1000 processes each opening up 1000 file descriptors before I can trigger the OOM killer. Much better. [mlp@google.com: task_mmu small fixes] Signed-off-by: Eric W. Biederman Cc: Trond Myklebust Cc: Paul Jackson Cc: Oleg Nesterov Cc: Albert Cahalan Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73e0f23b7f5..6b9740bbf4c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; struct file *file = vma->vm_file; @@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file *m, void *v) return 0; mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + get_vma_policy(priv->task, vma, vma->vm_start)); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1891,7 +1891,7 @@ out: kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } -- cgit v1.2.3 From 95dc112a5770dc670a1b45a3d9ee346fdd2b2697 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 20 Jun 2005 21:15:16 -0700 Subject: [PATCH] devfs: Remove devfs_mk_dir() function from the kernel tree Removes the devfs_mk_dir() function and all callers of it. Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 4 +--- mm/tiny-shmem.c | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 38bc3334f26..a4161abb9e7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2252,9 +2252,7 @@ static int __init init_tmpfs(void) printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } -#ifdef CONFIG_TMPFS - devfs_mk_dir("shm"); -#endif + shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, tmpfs_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9cc91c..11ab99b8c61 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -33,9 +33,6 @@ static int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); -#ifdef CONFIG_TMPFS - devfs_mk_dir("shm"); -#endif shm_mnt = kern_mount(&tmpfs_fs_type); BUG_ON(IS_ERR(shm_mnt)); -- cgit v1.2.3 From ff23eca3e8f613034e0d20ff86f6a89b62f5a14e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 20 Jun 2005 21:15:16 -0700 Subject: [PATCH] devfs: Remove the devfs_fs_kernel.h file from the tree Also fixes up all files that #include it. Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 1 - mm/tiny-shmem.c | 1 - 2 files changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index a4161abb9e7..355904712a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 11ab99b8c61..5f2cbf0f153 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 6550e07f41ce8473ed684dac54fbfbd42183ffda Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 12 Jun 2006 17:11:31 -0700 Subject: [PATCH] 64bit Resource: finally enable 64bit resource sizes Introduce the Kconfig entry and actually switch to a 64bit value, if wanted, for resource_size_t. Based on a patch series originally from Vivek Goyal Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 66e65ab3942..e3644b0062b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -145,3 +145,9 @@ config MIGRATION while the virtual addresses are not changed. This is useful for example on NUMA systems to put pages nearer to the processors accessing the page. + +config RESOURCES_64BIT + bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) + default 64BIT + help + This option allows memory and IO resources to be 64 bit. -- cgit v1.2.3 From bc02af93dd2bbddce1b55e0a493f833a1b7cf140 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 27 Jun 2006 02:53:30 -0700 Subject: [PATCH] pgdat allocation for new node add (specify node id) Change the name of old add_memory() to arch_add_memory. And use node id to get pgdat for the node at NODE_DATA(). Note: Powerpc's old add_memory() is defined as __devinit. However, add_memory() is usually called only after bootup. I suppose it may be redundant. But, I'm not well known about powerpc. So, I keep it. (But, __meminit is better at least.) Signed-off-by: Yasunori Goto Cc: Dave Hansen Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 841a077d5ae..6cdeabe9f6d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -163,3 +163,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) vm_total_pages = nr_free_pagecache_pages(); return 0; } + +int add_memory(int nid, u64 start, u64 size) +{ + int ret; + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); -- cgit v1.2.3 From 3218ae14b1e3ee2ab81df30ed690c8e864d23316 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 27 Jun 2006 02:53:33 -0700 Subject: [PATCH] pgdat allocation for new node add (export kswapd start func) When node is hot-added, kswapd for the node should start. This export kswapd start function as kswapd_run() to use at add_memory(). [akpm@osdl.org: daemonize() isn't needed when using the kthread API] Signed-off-by: Yasunori Goto Signed-off-by: KAMEZAWA Hiroyuki Cc: Dave Hansen Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 72babac71de..f03da33d914 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1223,7 +1224,6 @@ static int kswapd(void *p) }; cpumask_t cpumask; - daemonize("kswapd%d", pgdat->node_id); cpumask = node_to_cpumask(pgdat->node_id); if (!cpus_empty(cpumask)) set_cpus_allowed(tsk, cpumask); @@ -1468,20 +1468,35 @@ static int cpu_callback(struct notifier_block *nfb, } #endif /* CONFIG_HOTPLUG_CPU */ +/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ +int kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kswapd) + return 0; + + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; + } + return ret; +} + static int __init kswapd_init(void) { - pg_data_t *pgdat; + int nid; swap_setup(); - for_each_online_pgdat(pgdat) { - pid_t pid; - - pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); - BUG_ON(pid < 0); - read_lock(&tasklist_lock); - pgdat->kswapd = find_task_by_pid(pid); - read_unlock(&tasklist_lock); - } + for_each_online_node(nid) + kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; } -- cgit v1.2.3 From 9af3c2dea3a3ae4248d81a70b556adfe1dc65d55 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 27 Jun 2006 02:53:34 -0700 Subject: [PATCH] pgdat allocation for new node add (call pgdat allocation) Add node-hot-add support to add_memory(). node hotadd uses this sequence. 1. allocate pgdat. 2. refresh NODE_DATA() 3. call free_area_init_node() to initialize 4. create sysfs entry 5. add memory (old add_memory()) 6. set node online 7. run kswapd for new node. (8). update zonelist after pages are onlined. (This is already merged in -mm due to update phase is difference.) Note: To make common function as much as possible, there is 2 changes from v2. - The old add_memory(), which is defiend by each archs, is renamed to arch_add_memory(). New add_memory becomes caller of arch dependent function as a common code. - This patch changes add_memory()'s interface From: add_memory(start, end) TO : add_memory(nid, start, end). It was cause of similar code that finding node id from physical address is inside of old add_memory() on each arch. In addition, acpi memory hotplug driver can find node id easier. In v2, it must walk DSDT'S _CRS by matching physical address to get the handle of its memory device, then get _PXM and node id. Because input is just physical address. However, in v3, the acpi driver can use handle to get _PXM and node id for the new memory device. It can pass just node id to add_memory(). Fix interface of arch_add_memory() is in next patche. Signed-off-by: Yasunori Goto Signed-off-by: KAMEZAWA Hiroyuki Cc: Dave Hansen Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6cdeabe9f6d..83d37a401b3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -164,13 +164,65 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) return 0; } +static pg_data_t *hotadd_new_pgdat(int nid, u64 start) +{ + struct pglist_data *pgdat; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + unsigned long start_pfn = start >> PAGE_SHIFT; + + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; + + arch_refresh_nodedata(nid, pgdat); + + /* we can use NODE_DATA(nid) from here */ + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + + return pgdat; +} + +static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +{ + arch_refresh_nodedata(nid, NULL); + arch_free_nodedata(pgdat); + return; +} + int add_memory(int nid, u64 start, u64 size) { + pg_data_t *pgdat = NULL; + int new_pgdat = 0; int ret; + if (!node_online(nid)) { + pgdat = hotadd_new_pgdat(nid, start); + if (!pgdat) + return -ENOMEM; + new_pgdat = 1; + ret = kswapd_run(nid); + if (ret) + goto error; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size); + if (ret < 0) + goto error; + + /* we online node here. we have no error path from here. */ + node_set_online(nid); + + return ret; +error: + /* rollback pgdat allocation and others */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + return ret; } EXPORT_SYMBOL_GPL(add_memory); -- cgit v1.2.3 From 0a54703904a4a206686b4e8c3f5a6927b60747aa Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 27 Jun 2006 02:53:35 -0700 Subject: [PATCH] register hot-added memory to iomem resource Register hot-added memory to iomem_resource. With this, /proc/iomem can show hot-added memory. Note: kdump uses /proc/iomem to catch memory range when it is installed. So, kdump should be re-installed after /proc/iomem change. Signed-off-by: KAMEZAWA Hiroyuki Cc: Vivek Goyal Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 83d37a401b3..0b11a854344 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -192,6 +193,27 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) return; } +/* add this memory to iomem resource */ +static void register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + } +} + + + int add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; @@ -217,6 +239,9 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we have no error path from here. */ node_set_online(nid); + /* register this memory as resource */ + register_memory_resource(start, size); + return ret; error: /* rollback pgdat allocation and others */ -- cgit v1.2.3 From 2842f11419704f8707fffc82e10d2263427fc130 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 27 Jun 2006 02:53:36 -0700 Subject: [PATCH] catch valid mem range at onlining memory This patch allows hot-add memory which is not aligned to section. Now, hot-added memory has to be aligned to section size. Considering big section sized archs, this is not useful. When hot-added memory is registerd as iomem resoruce by iomem resource patch, we can make use of that information to detect valid memory range. Note: With this, not-aligned memory can be registerd. To allow hot-add memory with holes, we have to do more work around add_memory(). (It doesn't allows add memory to already existing mem section.) Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0b11a854344..f13783e81eb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -127,6 +127,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) unsigned long i; unsigned long flags; unsigned long onlined_pages = 0; + struct resource res; + u64 section_end; + unsigned long start_pfn; struct zone *zone; int need_zonelists_rebuild = 0; @@ -149,10 +152,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pfn_to_page(pfn + i); - online_page(page); - onlined_pages++; + res.start = (u64)pfn << PAGE_SHIFT; + res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; + res.flags = IORESOURCE_MEM; /* we just need system ram */ + section_end = res.end; + + while (find_next_system_ram(&res) >= 0) { + start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); + nr_pages = (unsigned long) + ((res.end + 1 - res.start) >> PAGE_SHIFT); + + if (PageReserved(pfn_to_page(start_pfn))) { + /* this region's page is not onlined now */ + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(start_pfn + i); + online_page(page); + onlined_pages++; + } + } + + res.start = res.end + 1; + res.end = section_end; } zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; -- cgit v1.2.3 From 1f04bbd2d396a701c5af2e5b92bad896c2550c16 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 27 Jun 2006 02:53:37 -0700 Subject: [PATCH] sparc64: support sparsemem and !memory hotplug Fix "undefined reference to `arch_add_memory'" on sparc64 allmodconfig. sparc64 doesn't support memory hotplug. But we want it to support sparsemem. Signed-off-by: Yasunori Goto Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 66e65ab3942..e76c023eb0b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -116,6 +116,7 @@ config SPARSEMEM_EXTREME config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND -- cgit v1.2.3 From 0fc44159bfcb5b0afa178f9c3f50db23aebc76ff Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 27 Jun 2006 02:53:38 -0700 Subject: [PATCH] Register sysfs file for hotplugged new node When new node becomes enable by hot-add, new sysfs file must be created for new node. So, if new node is enabled by add_memory(), register_one_node() is called to create it. In addition, I386's arch_register_node() and a part of register_nodes() of powerpc are consolidated to register_one_node() as a generic_code(). This is tested by Tiger4(IPF) with node hot-plug emulation. Signed-off-by: Keiichiro Tokunaga Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f13783e81eb..ea4038838b0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -256,9 +256,19 @@ int add_memory(int nid, u64 start, u64 size) if (ret < 0) goto error; - /* we online node here. we have no error path from here. */ + /* we online node here. we can't roll back from here. */ node_set_online(nid); + if (new_pgdat) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + */ + BUG_ON(ret); + } + /* register this memory as resource */ register_memory_resource(start, size); -- cgit v1.2.3 From c9cf55285e87ac423c45d9efca750d3f50234d10 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 27 Jun 2006 02:53:52 -0700 Subject: [PATCH] add poison.h and patch primary users Localize poison values into one header file for better documentation and easier/quicker debugging and so that the same values won't be used for multiple purposes. Use these constants in core arch., mm, driver, and fs code. Signed-off-by: Randy Dunlap Acked-by: Matt Mackall Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: "David S. Miller" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 98ac20bc0de..47982c2d9f3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -492,17 +493,6 @@ struct kmem_cache { #endif #if DEBUG -/* - * Magic nums for obj red zoning. - * Placed in the first word before and the first word after an obj. - */ -#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ -#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ - -/* ...and for poisoning */ -#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ -#define POISON_FREE 0x6b /* for use-after-free poisoning */ -#define POISON_END 0xa5 /* end-byte of poisoning */ /* * memory layout of objects: -- cgit v1.2.3 From 34af946a22724c4e2b204957f2b24b22a0fb121c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:53:55 -0700 Subject: [PATCH] spin/rwlock init cleanups locking init cleanups: - convert " = SPIN_LOCK_UNLOCKED" to spin_lock_init() or DEFINE_SPINLOCK() - convert rwlocks in a similar manner this patch was generated automatically. Motivation: - cleanliness - lockdep needs control of lock initialization, which the open-coded variants do not give - it's also useful for -rt and for lock debugging in general Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index e0a3fe48aa3..c7a2b3a0e46 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -45,7 +45,7 @@ static struct mem_section *sparse_index_alloc(int nid) static int sparse_index_init(unsigned long section_nr, int nid) { - static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; int ret = 0; -- cgit v1.2.3 From 6527c2bdf1f833cc18e8f42bd97973d583e4aa83 Mon Sep 17 00:00:00 2001 From: "Vladimir V. Saveliev" Date: Tue, 27 Jun 2006 02:53:57 -0700 Subject: [PATCH] generic_file_buffered_write(): deadlock on vectored write generic_file_buffered_write() prefaults in user pages in order to avoid deadlock on copying from the same page as write goes to. However, it looks like there is a problem when write is vectored: fault_in_pages_readable brings in current segment or its part (maxlen). OTOH, filemap_copy_from_user_iovec is called to copy number of bytes (bytes) which may exceed current segment, so filemap_copy_from_user_iovec switches to the next segment which is not brought in yet. Pagefault is generated. That causes the deadlock if pagefault is for the same page write goes to: page being written is locked and not uptodate, pagefault will deadlock trying to lock locked page. [akpm@osdl.org: somewhat rewritten] Cc: Neil Brown Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 9c7334bafda..d504d6e9888 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2095,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, do { unsigned long index; unsigned long offset; - unsigned long maxlen; size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; + + /* Limit the size of the copy to the caller's write size */ + bytes = min(bytes, count); + + /* + * Limit the size of the copy to that of the current segment, + * because fault_in_pages_readable() doesn't know how to walk + * segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); /* * Bring in the user page that we will copy from _first_. @@ -2110,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * same page as we're writing to, without it being marked * up-to-date. */ - maxlen = cur_iov->iov_len - iov_base; - if (maxlen > bytes) - maxlen = bytes; - fault_in_pages_readable(buf, maxlen); + fault_in_pages_readable(buf, bytes); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { -- cgit v1.2.3 From 9c7b216d23e820e0e148d5be01bbb5bd2d8378fe Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:07 -0700 Subject: [PATCH] cpu hotplug: revert init patch submitted for 2.6.17 In 2.6.17, there was a problem with cpu_notifiers and XFS. I provided a band-aid solution to solve that problem. In the process, i undid all the changes you both were making to ensure that these notifiers were available only at init time (unless CONFIG_HOTPLUG_CPU is defined). We deferred the real fix to 2.6.18. Here is a set of patches that fixes the XFS problem cleanly and makes the cpu notifiers available only at init time (unless CONFIG_HOTPLUG_CPU is defined). If CONFIG_HOTPLUG_CPU is defined then cpu notifiers are available at run time. This patch reverts the notifier_call changes made in 2.6.17 Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f86191bb63..e9fb2d4064c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2009,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/mm/slab.c b/mm/slab.c index 47982c2d9f3..631c0feb964 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1073,7 +1073,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int cpuup_callback(struct notifier_block *nfb, +static int __devinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/mm/vmscan.c b/mm/vmscan.c index f03da33d914..eeacb0d695c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1450,7 +1450,7 @@ out: not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { pg_data_t *pgdat; -- cgit v1.2.3 From 74b85f3790aa2550c617fe14439482e13e615fa0 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:09 -0700 Subject: [PATCH] cpu hotplug: make cpu_notifier related notifier blocks __cpuinit only Make notifier_blocks associated with cpu_notifier as __cpuinitdata. __cpuinitdata makes sure that the data is init time only unless CONFIG_HOTPLUG_CPU is defined. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.c | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8ccf6f1b147..8ac6bfb4007 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -523,7 +523,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) return 0; } -static struct notifier_block ratelimit_nb = { +static struct notifier_block __cpuinitdata ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e9fb2d4064c..dafd12ec7a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2031,7 +2031,7 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, return ret; } -static struct notifier_block pageset_notifier = +static struct notifier_block __cpuinitdata pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; void __init setup_per_cpu_pageset(void) diff --git a/mm/slab.c b/mm/slab.c index 631c0feb964..d1d55279202 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1255,7 +1255,9 @@ bad: return NOTIFY_BAD; } -static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; +static struct notifier_block __cpuinitdata cpucache_notifier = { + &cpuup_callback, NULL, 0 +}; /* * swap the static kmem_list3 with kmalloced memory -- cgit v1.2.3 From 26c2143b63b8078d08d562733716de142927e17a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 27 Jun 2006 02:54:10 -0700 Subject: [PATCH] cpu hotplug: make cpu_notifier related notifier calls __cpuinit only Make notifier_calls associated with cpu_notifier as __cpuinit. __cpuinit makes sure that the function is init time only unless CONFIG_HOTPLUG_CPU is defined. [akpm@osdl.org: section fix] Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8ac6bfb4007..4ec7026c7ba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -516,7 +516,7 @@ static void set_ratelimit(void) ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; } -static int +static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { set_ratelimit(); -- cgit v1.2.3 From f9b8404cf8f8456dfa83459510762b700dc00385 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:49 -0700 Subject: [PATCH] pi-futex: introduce debug_check_no_locks_freed() Add debug_check_no_locks_freed(), as a central inline to add bad-lock-free-debugging functionality to. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- mm/slab.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dafd12ec7a0..084a2de7e52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -446,8 +446,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); if (!PageHighMem(page)) - mutex_debug_check_no_locks_freed(page_address(page), - PAGE_SIZE< Date: Tue, 27 Jun 2006 02:54:55 -0700 Subject: [PATCH] pi-futex: rt mutex debug Runtime debugging functionality for rt-mutexes. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f378d027c68..233e39d14ca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From f5e54d6e53a20cef45af7499e86164f0e0d16bb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jun 2006 04:26:44 -0700 Subject: [PATCH] mark address_space_operations const Same as with already do with the file operations: keep them in .rodata and prevents people from doing runtime patching. Signed-off-by: Christoph Hellwig Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/filemap_xip.c | 2 +- mm/shmem.c | 4 ++-- mm/swap_state.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d504d6e9888..4082b3b3cea 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2069,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - struct address_space_operations *a_ops = mapping->a_ops; + const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; @@ -2219,7 +2219,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + const struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b960ac8e591..b4fd0d7c9bf 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf, size_t count, loff_t pos, loff_t *ppos) { struct address_space * mapping = filp->f_mapping; - struct address_space_operations *a_ops = mapping->a_ops; + const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; diff --git a/mm/shmem.c b/mm/shmem.c index 38bc3334f26..ea64c07cbe7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -174,7 +174,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) } static struct super_operations shmem_ops; -static struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; @@ -2162,7 +2162,7 @@ static void destroy_inodecache(void) printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); } -static struct address_space_operations shmem_aops = { +static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS diff --git a/mm/swap_state.c b/mm/swap_state.c index e0e1583f32c..7535211bb49 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,7 +24,7 @@ * vmscan's shrink_list, to make sync_page look nicer, and to allow * future use of radix_tree tags in the swap cache. */ -static struct address_space_operations swap_aops = { +static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, -- cgit v1.2.3 From 81b0c8713385ce1b1b9058e916edcf9561ad76d6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 Jun 2006 02:24:26 -0700 Subject: [PATCH] generic_file_buffered_write(): handle zero-length iovec segments The recent generic_file_write() deadlock fix caused generic_file_buffered_write() to loop inifinitely when presented with a zero-length iovec segment. Fix. Note that this fix deliberately avoids calling ->prepare_write(), ->commit_write() etc with a zero-length write. This is because I don't trust all filesystems to get that right. This is a cautious approach, for 2.6.17.x. For 2.6.18 we should just go ahead and call ->prepare_write() and ->commit_write() with the zero length and fix any broken filesystems. So I'll make that change once this code is stabilised and backported into 2.6.17.x. The reason for preferring to call ->prepare_write() and ->commit_write() with the zero-length segment: a zero-length segment _should_ be sufficiently uncommon that this is the correct way of handling it. We don't want to optimise for poorly-written userspace at the expense of well-written userspace. Cc: "Vladimir V. Saveliev" Cc: Neil Brown Cc: Martin Schwidefsky Cc: Chris Wright Cc: Greg KH Cc: Cc: walt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 9 ++++++++- mm/filemap.h | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4082b3b3cea..648f2c0c8e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2125,6 +2125,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } + if (unlikely(bytes == 0)) { + status = 0; + copied = 0; + goto zero_length_segment; + } + status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); @@ -2154,7 +2160,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, page_cache_release(page); continue; } - if (likely(copied > 0)) { +zero_length_segment: + if (likely(copied >= 0)) { if (!status) status = copied; diff --git a/mm/filemap.h b/mm/filemap.h index 536979fb4ba..3f2a343c601 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -88,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) const struct iovec *iov = *iovp; size_t base = *basep; - while (bytes) { + do { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -97,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) iov++; base = 0; } - } + } while (bytes); *iovp = iov; *basep = base; } -- cgit v1.2.3 From cc57637b0b015fb5d70dbbec740de516d33af07d Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Thu, 29 Jun 2006 02:24:27 -0700 Subject: [PATCH] solve config broken: undefined reference to `online_page' Memory hotplug code of i386 adds memory to only highmem. So, if CONFIG_HIGHMEM is not set, CONFIG_MEMORY_HOTPLUG shouldn't be set. Otherwise, it causes compile error. In addition, many architecture can't use memory hotplug feature yet. So, I introduce CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG. Signed-off-by: Yasunori Goto Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index e76c023eb0b..83917291d45 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -115,7 +115,7 @@ config SPARSEMEM_EXTREME # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" -- cgit v1.2.3 From 6ab3d5624e172c553004ecc862bfeac16d9d68b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Engel?= Date: Fri, 30 Jun 2006 19:25:36 +0200 Subject: Remove obsolete #include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jörn Engel Signed-off-by: Adrian Bunk --- mm/filemap.c | 1 - mm/memory_hotplug.c | 1 - mm/mmzone.c | 1 - mm/page_alloc.c | 1 - mm/shmem.c | 1 - mm/slob.c | 1 - mm/sparse.c | 1 - mm/swapfile.c | 1 - 8 files changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 648f2c0c8e1..f02ca30372c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -9,7 +9,6 @@ * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ -#include #include #include #include diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ea4038838b0..01c9fb97c61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -4,7 +4,6 @@ * Copyright (C) */ -#include #include #include #include diff --git a/mm/mmzone.c b/mm/mmzone.c index b022370e612..0959ee1a479 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -5,7 +5,6 @@ */ -#include #include #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 084a2de7e52..60f2feddbe5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -14,7 +14,6 @@ * (lots of bits borrowed from Ingo Molnar & Andrew Morton) */ -#include #include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index b14ff817d16..83c9fea1e0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -23,7 +23,6 @@ * which makes it a completely usable filesystem. */ -#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index a68255ba455..7b52b20b960 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -29,7 +29,6 @@ * essentially no allocation space overhead. */ -#include #include #include #include diff --git a/mm/sparse.c b/mm/sparse.c index c7a2b3a0e46..86c52ab8087 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -1,7 +1,6 @@ /* * sparse memory mappings. */ -#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index cc367f7e75d..e70d6c6d6fe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -5,7 +5,6 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ -#include #include #include #include -- cgit v1.2.3 From f6ac2354d791195ca40822b84d73d48a4e8b7f2b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:32 -0700 Subject: [PATCH] zoned vm counters: create vmstat.c/.h from page_alloc.c/.h NOTE: ZVC are *not* the lightweight event counters. ZVCs are reliable whereas event counters do not need to be. Zone based VM statistics are necessary to be able to determine what the state of memory in one zone is. In a NUMA system this can be helpful for local reclaim and other memory optimizations that may be able to shift VM load in order to get more balanced memory use. It is also useful to know how the computing load affects the memory allocations on various zones. This patchset allows the retrieval of that data from userspace. The patchset introduces a framework for counters that is a cross between the existing page_stats --which are simply global counters split per cpu-- and the approach of deferred incremental updates implemented for nr_pagecache. Small per cpu 8 bit counters are added to struct zone. If the counter exceeds certain thresholds then the counters are accumulated in an array of atomic_long in the zone and in a global array that sums up all zone values. The small 8 bit counters are next to the per cpu page pointers and so they will be in high in the cpu cache when pages are allocated and freed. Access to VM counter information for a zone and for the whole machine is then possible by simply indexing an array (Thanks to Nick Piggin for pointing out that approach). The access to the total number of pages of various types does no longer require the summing up of all per cpu counters. Benefits of this patchset right now: - Ability for UP and SMP configuration to determine how memory is balanced between the DMA, NORMAL and HIGHMEM zones. - loops over all processors are avoided in writeback and reclaim paths. We can avoid caching the writeback information because the needed information is directly accessible. - Special handling for nr_pagecache removed. - zone_reclaim_interval vanishes since VM stats can now determine when it is worth to do local reclaim. - Fast inline per node page state determination. - Accurate counters in /sys/devices/system/node/node*/meminfo. Current counters are counting simply which processor allocated a page somewhere and guestimate based on that. So the counters were not useful to show the actual distribution of page use on a specific zone. - The swap_prefetch patch requires per node statistics in order to figure out when processors of a node can prefetch. This patch provides some of the needed numbers. - Detailed VM counters available in more /proc and /sys status files. References to earlier discussions: V1 http://marc.theaimsgroup.com/?l=linux-kernel&m=113511649910826&w=2 V2 http://marc.theaimsgroup.com/?l=linux-kernel&m=114980851924230&w=2 V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115014697910351&w=2 V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767318740&w=2 Performance tests with AIM7 did not show any regressions. Seems to be a tad faster even. Tested on ia64/NUMA. Builds fine on i386, SMP / UP. Includes fixes for s390/arm/uml arch code. This patch: Move counter code from page_alloc.c/page-flags.h to vmstat.c/h. Create vmstat.c/vmstat.h by separating the counter code and the proc functions. Move the vm_stat_text array before zoneinfo_show. [akpm@osdl.org: s390 build fix] [akpm@osdl.org: HOTPLUG_CPU build fix] Signed-off-by: Christoph Lameter Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/page_alloc.c | 407 ------------------------------------------------------ mm/vmstat.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 418 insertions(+), 408 deletions(-) create mode 100644 mm/vmstat.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 0b8f73f2ed1..9dd824c11ee 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o mmzone.o $(mmu-y) + prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 084a2de7e52..87dc1297fe3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1231,141 +1231,6 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -static DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - -static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) -{ - unsigned cpu; - - memset(ret, 0, nr * sizeof(unsigned long)); - cpus_and(*cpumask, *cpumask, cpu_online_map); - - for_each_cpu_mask(cpu, *cpumask) { - unsigned long *in; - unsigned long *out; - unsigned off; - unsigned next_cpu; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - next_cpu = next_cpu(cpu, *cpumask); - if (likely(next_cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, next_cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_page_state_node(struct page_state *ret, int node) -{ - int nr; - cpumask_t mask = node_to_cpumask(node); - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr+1, &mask); -} - -void get_page_state(struct page_state *ret) -{ - int nr; - cpumask_t mask = CPU_MASK_ALL; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1, &mask); -} - -void get_full_page_state(struct page_state *ret) -{ - cpumask_t mask = CPU_MASK_ALL; - - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); -} - -unsigned long read_page_state_offset(unsigned long offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - -void __mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - void *ptr; - - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; -} -EXPORT_SYMBOL(__mod_page_state_offset); - -void mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - unsigned long flags; - void *ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_page_state_offset); - -void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) -{ - struct zone *zones = pgdat->node_zones; - int i; - - *active = 0; - *inactive = 0; - *free = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - *active += zones[i].nr_active; - *inactive += zones[i].nr_inactive; - *free += zones[i].free_pages; - } -} - -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) -{ - struct pglist_data *pgdat; - - *active = 0; - *inactive = 0; - *free = 0; - for_each_online_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); - *active += l; - *inactive += m; - *free += n; - } -} - void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -2253,278 +2118,6 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_PROC_FS - -#include - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - for (pgdat = first_online_pgdat(); - pgdat && node; - pgdat = next_online_pgdat(pgdat)) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return next_online_pgdat(pgdat); -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* - * This walks the free areas for each zone. - */ -static int frag_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - int order; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations fragmentation_op = { - .start = frag_start, - .next = frag_next, - .stop = frag_stop, - .show = frag_show, -}; - -/* - * Output information about zones in @pgdat. - */ -static int zoneinfo_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { - int i; - - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); - seq_printf(m, - "\n pages free %lu" - "\n min %lu" - "\n low %lu" - "\n high %lu" - "\n active %lu" - "\n inactive %lu" - "\n scanned %lu (a: %lu i: %lu)" - "\n spanned %lu" - "\n present %lu", - zone->free_pages, - zone->pages_min, - zone->pages_low, - zone->pages_high, - zone->nr_active, - zone->nr_inactive, - zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, - zone->spanned_pages, - zone->present_pages); - seq_printf(m, - "\n protection: (%lu", - zone->lowmem_reserve[0]); - for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); - for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; - int j; - - pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - if (pageset->pcp[j].count) - break; - } - if (j == ARRAY_SIZE(pageset->pcp)) - continue; - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } -#ifdef CONFIG_NUMA - seq_printf(m, - "\n numa_hit: %lu" - "\n numa_miss: %lu" - "\n numa_foreign: %lu" - "\n interleave_hit: %lu" - "\n local_node: %lu" - "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); -#endif - } - seq_printf(m, - "\n all_unreclaimable: %u" - "\n prev_priority: %i" - "\n temp_priority: %i" - "\n start_pfn: %lu", - zone->all_unreclaimable, - zone->prev_priority, - zone->temp_priority, - zone->zone_start_pfn); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations zoneinfo_op = { - .start = frag_start, /* iterate over all zones. The same as in - * fragmentation. */ - .next = frag_next, - .stop = frag_stop, - .show = zoneinfo_show, -}; - -static char *vmstat_text[] = { - "nr_dirty", - "nr_writeback", - "nr_unstable", - "nr_page_table_pages", - "nr_mapped", - "nr_slab", - - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - "pgalloc_high", - "pgalloc_normal", - "pgalloc_dma32", - "pgalloc_dma", - - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma32", - "pgrefill_dma", - - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma32", - "pgsteal_dma", - - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - "pgscan_kswapd_dma32", - "pgscan_kswapd_dma", - - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma32", - "pgscan_direct_dma", - - "pginodesteal", - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "pageoutrun", - "allocstall", - - "pgrotated", - "nr_bounce", -}; - -static void *vmstat_start(struct seq_file *m, loff_t *pos) -{ - struct page_state *ps; - - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -} - -static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -{ - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - return (unsigned long *)m->private + *pos; -} - -static int vmstat_show(struct seq_file *m, void *arg) -{ - unsigned long *l = arg; - unsigned long off = l - (unsigned long *)m->private; - - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); - return 0; -} - -static void vmstat_stop(struct seq_file *m, void *arg) -{ - kfree(m->private); - m->private = NULL; -} - -struct seq_operations vmstat_op = { - .start = vmstat_start, - .next = vmstat_next, - .stop = vmstat_stop, - .show = vmstat_show, -}; - -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 00000000000..ad456202ff1 --- /dev/null +++ b/mm/vmstat.c @@ -0,0 +1,417 @@ +/* + * linux/mm/vmstat.c + * + * Manages VM statistics + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include +#include + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +DEFINE_PER_CPU(struct page_state, page_states) = {0}; + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +{ + unsigned cpu; + + memset(ret, 0, nr * sizeof(unsigned long)); + cpus_and(*cpumask, *cpumask, cpu_online_map); + + for_each_cpu_mask(cpu, *cpumask) { + unsigned long *in; + unsigned long *out; + unsigned off; + unsigned next_cpu; + + in = (unsigned long *)&per_cpu(page_states, cpu); + + next_cpu = next_cpu(cpu, *cpumask); + if (likely(next_cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, next_cpu)); + + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state_node(struct page_state *ret, int node) +{ + int nr; + cpumask_t mask = node_to_cpumask(node); + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr+1, &mask); +} + +void get_page_state(struct page_state *ret) +{ + int nr; + cpumask_t mask = CPU_MASK_ALL; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1, &mask); +} + +void get_full_page_state(struct page_state *ret) +{ + cpumask_t mask = CPU_MASK_ALL; + + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); +} + +unsigned long read_page_state_offset(unsigned long offset) +{ + unsigned long ret = 0; + int cpu; + + for_each_online_cpu(cpu) { + unsigned long in; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + unsigned long flags; + void *ptr; + + local_irq_save(flags); + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_page_state_offset); + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_online_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + "pgalloc_high", + "pgalloc_normal", + "pgalloc_dma32", + "pgalloc_dma", + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma32", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma32", + "pgsteal_dma", + + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + "pgscan_kswapd_dma32", + "pgscan_kswapd_dma", + + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma32", + "pgscan_direct_dma", + + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", + "nr_bounce", +}; + +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } +#ifdef CONFIG_NUMA + seq_printf(m, + "\n numa_hit: %lu" + "\n numa_miss: %lu" + "\n numa_foreign: %lu" + "\n interleave_hit: %lu" + "\n local_node: %lu" + "\n other_node: %lu", + pageset->numa_hit, + pageset->numa_miss, + pageset->numa_foreign, + pageset->interleave_hit, + pageset->local_node, + pageset->other_node); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + -- cgit v1.2.3 From 2244b95a7bcf8d24196f8a3a44187ba5dfff754c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:33 -0700 Subject: [PATCH] zoned vm counters: basic ZVC (zoned vm counter) implementation Per zone counter infrastructure The counters that we currently have for the VM are split per processor. The processor however has not much to do with the zone these pages belong to. We cannot tell f.e. how many ZONE_DMA pages are dirty. So we are blind to potentially inbalances in the usage of memory in various zones. F.e. in a NUMA system we cannot tell how many pages are dirty on a particular node. If we knew then we could put measures into the VM to balance the use of memory between different zones and different nodes in a NUMA system. For example it would be possible to limit the dirty pages per node so that fast local memory is kept available even if a process is dirtying huge amounts of pages. Another example is zone reclaim. We do not know how many unmapped pages exist per zone. So we just have to try to reclaim. If it is not working then we pause and try again later. It would be better if we knew when it makes sense to reclaim unmapped pages from a zone. This patchset allows the determination of the number of unmapped pages per zone. We can remove the zone reclaim interval with the counters introduced here. Futhermore the ability to have various usage statistics available will allow the development of new NUMA balancing algorithms that may be able to improve the decision making in the scheduler of when to move a process to another node and hopefully will also enable automatic page migration through a user space program that can analyse the memory load distribution and then rebalance memory use in order to increase performance. The counter framework here implements differential counters for each processor in struct zone. The differential counters are consolidated when a threshold is exceeded (like done in the current implementation for nr_pageache), when slab reaping occurs or when a consolidation function is called. Consolidation uses atomic operations and accumulates counters per zone in the zone structure and also globally in the vm_stat array. VM functions can access the counts by simply indexing a global or zone specific array. The arrangement of counters in an array also simplifies processing when output has to be generated for /proc/*. Counters can be updated by calling inc/dec_zone_page_state or _inc/dec_zone_page_state analogous to *_page_state. The second group of functions can be called if it is known that interrupts are disabled. Special optimized increment and decrement functions are provided. These can avoid certain checks and use increment or decrement instructions that an architecture may provide. We also add a new CONFIG_DMA_IS_NORMAL that signifies that an architecture can do DMA to all memory and therefore ZONE_NORMAL will not be populated. This is only currently set for IA64 SGI SN2 and currently only affects node_page_state(). In the best case node_page_state can be reduced to retrieving a single counter for the one zone on the node. [akpm@osdl.org: cleanups] [akpm@osdl.org: export vm_stat[] for filesystems] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 + mm/slab.c | 1 + mm/vmstat.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 217 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 87dc1297fe3..3a877fecc30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2045,6 +2045,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + zap_zone_vm_stats(zone); atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; @@ -2147,6 +2148,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, } local_irq_enable(); + refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; } diff --git a/mm/slab.c b/mm/slab.c index 233e39d14ca..0c33820038c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3763,6 +3763,7 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); + refresh_cpu_vm_stats(smp_processor_id()); /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } diff --git a/mm/vmstat.c b/mm/vmstat.c index ad456202ff1..210f9bbbb04 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -3,10 +3,15 @@ * * Manages VM statistics * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * zoned VM statistics + * Copyright (C) 2006 Silicon Graphics, Inc., + * Christoph Lameter */ #include #include +#include /* * Accumulate the page_state information across all CPUs. @@ -143,6 +148,197 @@ void get_zone_counts(unsigned long *active, } } +/* + * Manage combined zone based / global counters + * + * vm_stat contains the global counters + */ +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +EXPORT_SYMBOL(vm_stat); + +#ifdef CONFIG_SMP + +#define STAT_THRESHOLD 32 + +/* + * Determine pointer to currently valid differential byte given a zone and + * the item number. + * + * Preemption must be off + */ +static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +{ + return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; +} + +/* + * For use when we know that interrupts are disabled. + */ +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + s8 *p; + long x; + + p = diff_pointer(zone, item); + x = delta + *p; + + if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + zone_page_state_add(x, zone, item); + x = 0; + } + + *p = x; +} +EXPORT_SYMBOL(__mod_zone_page_state); + +/* + * For an unknown interrupt state + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + +/* + * Optimized increment and decrement functions. + * + * These are only for a single page and therefore can take a struct page * + * argument instead of struct zone *. This allows the inclusion of the code + * generated for page_zone(page) into the optimized functions. + * + * No overflow check is necessary and therefore the differential can be + * incremented or decremented in place which may allow the compilers to + * generate better code. + * + * The increment or decrement is known and therefore one boundary check can + * be omitted. + * + * Some processors have inc/dec instructions that are atomic vs an interrupt. + * However, the code must first determine the differential location in a zone + * based on the processor number and then inc/dec the counter. There is no + * guarantee without disabling preemption that the processor will not change + * in between and therefore the atomicity vs. interrupt cannot be exploited + * in a useful way here. + */ +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + struct zone *zone = page_zone(page); + s8 *p = diff_pointer(zone, item); + + (*p)++; + + if (unlikely(*p > STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} +EXPORT_SYMBOL(__inc_zone_page_state); + +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + struct zone *zone = page_zone(page); + s8 *p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } +} +EXPORT_SYMBOL(__dec_zone_page_state); + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + s8 *p; + + zone = page_zone(page); + local_irq_save(flags); + p = diff_pointer(zone, item); + + (*p)++; + + if (unlikely(*p > STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + s8 *p; + + zone = page_zone(page); + local_irq_save(flags); + p = diff_pointer(zone, item); + + (*p)--; + + if (unlikely(*p < -STAT_THRESHOLD)) { + zone_page_state_add(*p, zone, item); + *p = 0; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_zone_page_state); + +/* + * Update the zone counters for one cpu. + */ +void refresh_cpu_vm_stats(int cpu) +{ + struct zone *zone; + int i; + unsigned long flags; + + for_each_zone(zone) { + struct per_cpu_pageset *pcp; + + pcp = zone_pcp(zone, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pcp->vm_stat_diff[i]) { + local_irq_save(flags); + zone_page_state_add(pcp->vm_stat_diff[i], + zone, i); + pcp->vm_stat_diff[i] = 0; + local_irq_restore(flags); + } + } +} + +static void __refresh_cpu_vm_stats(void *dummy) +{ + refresh_cpu_vm_stats(smp_processor_id()); +} + +/* + * Consolidate all counters. + * + * Note that the result is less inaccurate but still inaccurate + * if concurrent processes are allowed to run. + */ +void refresh_vm_stats(void) +{ + on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); +} +EXPORT_SYMBOL(refresh_vm_stats); + +#endif + #ifdef CONFIG_PROC_FS #include @@ -204,6 +400,9 @@ struct seq_operations fragmentation_op = { }; static char *vmstat_text[] = { + /* Zoned VM counters */ + + /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", @@ -297,6 +496,11 @@ static int zoneinfo_show(struct seq_file *m, void *arg) zone->nr_scan_active, zone->nr_scan_inactive, zone->spanned_pages, zone->present_pages); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + seq_printf(m, "\n protection: (%lu", zone->lowmem_reserve[0]); @@ -368,19 +572,25 @@ struct seq_operations zoneinfo_op = { static void *vmstat_start(struct seq_file *m, loff_t *pos) { + unsigned long *v; struct page_state *ps; + int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + sizeof(*ps), GFP_KERNEL); + m->private = v; + if (!v) return ERR_PTR(-ENOMEM); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); + ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS); get_full_page_state(ps); ps->pgpgin /= 2; /* sectors -> kbytes */ ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; + return v + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -- cgit v1.2.3 From 65ba55f500a37272985d071c9bbb35256a2f7c14 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:34 -0700 Subject: [PATCH] zoned vm counters: convert nr_mapped to per zone counter nr_mapped is important because it allows a determination of how many pages of a zone are not mapped, which would allow a more efficient means of determining when we need to reclaim memory in a zone. We take the nr_mapped field out of the page state structure and define a new per zone counter named NR_FILE_MAPPED (the anonymous pages will be split off from NR_MAPPED in the next patch). We replace the use of nr_mapped in various kernel locations. This avoids the looping over all processors in try_to_free_pages(), writeback, reclaim (swap + zone reclaim). [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/rmap.c | 6 +++--- mm/vmscan.c | 8 ++++---- mm/vmstat.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4ec7026c7ba..60c7244c42e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -111,7 +111,7 @@ static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = read_page_state(nr_dirty); wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = read_page_state(nr_mapped); + wbs->nr_mapped = global_page_state(NR_FILE_MAPPED); wbs->nr_writeback = read_page_state(nr_writeback); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a877fecc30..04dd2b01b2b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1319,7 +1319,7 @@ void show_free_areas(void) ps.nr_unstable, nr_free_pages(), ps.nr_slab, - ps.nr_mapped, + global_page_state(NR_FILE_MAPPED), ps.nr_page_table_pages); for_each_zone(zone) { diff --git a/mm/rmap.c b/mm/rmap.c index e76909e880c..af5e9808e65 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -455,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, * nr_mapped state can be updated without turning off * interrupts because it is not modified via interrupt. */ - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_FILE_MAPPED); } /** @@ -499,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) - __inc_page_state(nr_mapped); + __inc_zone_page_state(page, NR_FILE_MAPPED); } /** @@ -531,7 +531,7 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - __dec_page_state(nr_mapped); + __dec_zone_page_state(page, NR_FILE_MAPPED); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index eeacb0d695c..d2caf7471cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -990,7 +990,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; if (!priority) disable_swap_token(); @@ -1075,7 +1075,7 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); inc_page_state(pageoutrun); @@ -1407,7 +1407,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; - sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); @@ -1548,7 +1548,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .nr_mapped = read_page_state(nr_mapped), + .nr_mapped = global_page_state(NR_FILE_MAPPED), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, diff --git a/mm/vmstat.c b/mm/vmstat.c index 210f9bbbb04..4800091c129 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -401,13 +401,13 @@ struct seq_operations fragmentation_op = { static char *vmstat_text[] = { /* Zoned VM counters */ + "nr_mapped", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", "nr_page_table_pages", - "nr_mapped", "nr_slab", "pgpgin", -- cgit v1.2.3 From 347ce434d57da80fd5809c0c836f206a50999c26 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:35 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_pagecache to per zone counter Currently a single atomic variable is used to establish the size of the page cache in the whole machine. The zoned VM counters have the same method of implementation as the nr_pagecache code but also allow the determination of the pagecache size per zone. Remove the special implementation for nr_pagecache and make it a zoned counter named NR_FILE_PAGES. Updates of the page cache counters are always performed with interrupts off. We can therefore use the __ variant here. Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/page_alloc.c | 5 ----- mm/swap_state.c | 4 ++-- mm/vmstat.c | 7 +------ 6 files changed, 7 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 648f2c0c8e1..87d62c44c3f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -120,7 +120,7 @@ void __remove_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); } void remove_from_page_cache(struct page *page) @@ -449,7 +449,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; mapping->nrpages++; - pagecache_acct(1); + __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); diff --git a/mm/mmap.c b/mm/mmap.c index 6446c6134b0..c1868ecdbc5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb..5151c44a825 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { unsigned long n; - free = get_page_cache_size(); + free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04dd2b01b2b..8350720f98a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2124,16 +2124,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; - long *count; unsigned long *src, *dest; if (action == CPU_DEAD) { int i; - /* Drain local pagecache count. */ - count = &per_cpu(nr_pagecache_local, cpu); - atomic_add(*count, &nr_pagecache); - *count = 0; local_irq_disable(); __drain_pages(cpu); diff --git a/mm/swap_state.c b/mm/swap_state.c index 7535211bb49..fccbd9bba77 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, SetPageSwapCache(page); set_page_private(page, entry.val); total_swapcache_pages++; - pagecache_acct(1); + __inc_zone_page_state(page, NR_FILE_PAGES); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); @@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page) set_page_private(page, 0); ClearPageSwapCache(page); total_swapcache_pages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 4800091c129..f16b33eb6d5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -20,12 +20,6 @@ */ DEFINE_PER_CPU(struct page_state, page_states) = {0}; -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { unsigned cpu; @@ -402,6 +396,7 @@ struct seq_operations fragmentation_op = { static char *vmstat_text[] = { /* Zoned VM counters */ "nr_mapped", + "nr_file_pages", /* Page state */ "nr_dirty", -- cgit v1.2.3 From bf02cf4b6cf931d060ad5c6ce9b960af6faefd2d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:36 -0700 Subject: [PATCH] zoned vm counters: remove NR_FILE_MAPPED from scan control structure We can now access the number of pages in a mapped state in an inexpensive way in shrink_active_list. So drop the nr_mapped field from scan_control. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index d2caf7471cf..08bc54e8086 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -47,8 +47,6 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - unsigned long nr_mapped; /* From page_state */ - /* This context's GFP mask */ gfp_t gfp_mask; @@ -744,7 +742,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; + mapped_ratio = (global_page_state(NR_FILE_MAPPED) * 100) / + vm_total_pages; /* * Now decide how much we really want to unmap some pages. The @@ -990,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; if (!priority) disable_swap_token(); @@ -1075,8 +1073,6 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - sc.nr_mapped = global_page_state(NR_FILE_MAPPED); - inc_page_state(pageoutrun); for (i = 0; i < pgdat->nr_zones; i++) { @@ -1407,9 +1403,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; - sc.nr_mapped = global_page_state(NR_FILE_MAPPED); sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); if (ret >= nr_pages) goto out; @@ -1548,7 +1542,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .nr_mapped = global_page_state(NR_FILE_MAPPED), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, -- cgit v1.2.3 From f3dbd34460ff54962d3e3244b6bcb7f5295356e6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:36 -0700 Subject: [PATCH] zoned vm counters: split NR_ANON_PAGES off from NR_FILE_MAPPED The current NR_FILE_MAPPED is used by zone reclaim and the dirty load calculation as the number of mapped pagecache pages. However, that is not true. NR_FILE_MAPPED includes the mapped anonymous pages. This patch separates those and therefore allows an accurate tracking of the anonymous pages per zone. It then becomes possible to determine the number of unmapped pages per zone and we can avoid scanning for unmapped pages if there are none. Also it may now be possible to determine the mapped/unmapped ratio in get_dirty_limit. Isnt the number of anonymous pages irrelevant in that calculation? Note that this will change the meaning of the number of mapped pages reported in /proc/vmstat /proc/meminfo and in the per node statistics. This may affect user space tools that monitor these counters! NR_FILE_MAPPED works like NR_FILE_DIRTY. It is only valid for pagecache pages. Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 3 ++- mm/rmap.c | 5 +++-- mm/vmscan.c | 3 ++- mm/vmstat.c | 1 + 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 60c7244c42e..0faacfe1890 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -111,7 +111,8 @@ static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = read_page_state(nr_dirty); wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = global_page_state(NR_FILE_MAPPED); + wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); wbs->nr_writeback = read_page_state(nr_writeback); } diff --git a/mm/rmap.c b/mm/rmap.c index af5e9808e65..40158b59729 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -455,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, * nr_mapped state can be updated without turning off * interrupts because it is not modified via interrupt. */ - __inc_zone_page_state(page, NR_FILE_MAPPED); + __inc_zone_page_state(page, NR_ANON_PAGES); } /** @@ -531,7 +531,8 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - __dec_zone_page_state(page, NR_FILE_MAPPED); + __dec_zone_page_state(page, + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 08bc54e8086..2f0390161c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -742,7 +742,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (global_page_state(NR_FILE_MAPPED) * 100) / + mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / vm_total_pages; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index f16b33eb6d5..3baf4dffa62 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -395,6 +395,7 @@ struct seq_operations fragmentation_op = { static char *vmstat_text[] = { /* Zoned VM counters */ + "nr_anon_pages", "nr_mapped", "nr_file_pages", -- cgit v1.2.3 From 34aa1330f9b3c5783d269851d467326525207422 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:37 -0700 Subject: [PATCH] zoned vm counters: zone_reclaim: remove /proc/sys/vm/zone_reclaim_interval The zone_reclaim_interval was necessary because we were not able to determine how many unmapped pages exist in a zone. Therefore we had to scan in intervals to figure out if any pages were unmapped. With the zoned counters and NR_ANON_PAGES we now know the number of pagecache pages and the number of mapped pages in a zone. So we can simply skip the reclaim if there is an insufficient number of unmapped pages. We use SWAP_CLUSTER_MAX as the boundary. Drop all support for /proc/sys/vm/zone_reclaim_interval. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2f0390161c0..0960846d649 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1517,11 +1517,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ -/* - * Mininum time between zone reclaim scans - */ -int zone_reclaim_interval __read_mostly = 30*HZ; - /* * Priority for ZONE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of @@ -1587,16 +1582,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - - if (nr_reclaimed == 0) { - /* - * We were unable to reclaim enough pages to stay on node. We - * now allow off node accesses for a certain time period before - * trying again to reclaim pages from the local zone. - */ - zone->last_unsuccessful_zone_reclaim = jiffies; - } - return nr_reclaimed >= nr_pages; } @@ -1606,13 +1591,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Do not reclaim if there was a recent unsuccessful attempt at zone - * reclaim. In that case we let allocations go off node for the - * zone_reclaim_interval. Otherwise we would scan for each off-node - * page allocation. + * Do not reclaim if there are not enough reclaimable pages in this + * zone that would satify this allocations. + * + * All unmapped pagecache pages are reclaimable. + * + * Both counters may be temporarily off a bit so we use + * SWAP_CLUSTER_MAX as the boundary. It may also be good to + * leave a few frequently used unmapped pagecache pages around. */ - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) return 0; /* -- cgit v1.2.3 From 9a865ffa34b6117a5e0b67640a084d8c2e198c93 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:38 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_slab to per zone counter - Allows reclaim to access counter without looping over processor counts. - Allows accurate statistics on how many pages are used in a zone by the slab. This may become useful to balance slab allocations over various zones. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8350720f98a..a38a11cfb48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1318,7 +1318,7 @@ void show_free_areas(void) ps.nr_writeback, ps.nr_unstable, nr_free_pages(), - ps.nr_slab, + global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), ps.nr_page_table_pages); diff --git a/mm/slab.c b/mm/slab.c index 0c33820038c..5dcfb904480 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1507,7 +1507,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) atomic_add(nr_pages, &slab_reclaim_pages); - add_page_state(nr_slab, nr_pages); + add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); return page_address(page); @@ -1522,12 +1522,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; + sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); page++; } - sub_page_state(nr_slab, nr_freed); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0960846d649..d6942436ac9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1362,7 +1362,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for_each_zone(zone) lru_pages += zone->nr_active + zone->nr_inactive; - nr_slab = read_page_state(nr_slab); + nr_slab = global_page_state(NR_SLAB); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 3baf4dffa62..dc9e6920922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -398,13 +398,13 @@ static char *vmstat_text[] = { "nr_anon_pages", "nr_mapped", "nr_file_pages", + "nr_slab", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", "nr_page_table_pages", - "nr_slab", "pgpgin", "pgpgout", -- cgit v1.2.3 From df849a1529c106f7460e51479ca78fe07b07dc8c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:38 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_pagetables to per zone counter Conversion of nr_page_table_pages to a per zone counter [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 247b5c312b9..1a78791590f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) pmd_clear(pmd); pte_lock_deinit(page); pte_free_tlb(tlb, page); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(page, NR_PAGETABLE); tlb->mm->nr_ptes--; } @@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) pte_free(new); } else { mm->nr_ptes++; - inc_page_state(nr_page_table_pages); + inc_zone_page_state(new, NR_PAGETABLE); pmd_populate(mm, pmd, new); } spin_unlock(&mm->page_table_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a38a11cfb48..ed3f2a7b407 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1320,7 +1320,7 @@ void show_free_areas(void) nr_free_pages(), global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), - ps.nr_page_table_pages); + global_page_state(NR_PAGETABLE)); for_each_zone(zone) { int i; diff --git a/mm/vmstat.c b/mm/vmstat.c index dc9e6920922..292a35fe56c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -399,12 +399,12 @@ static char *vmstat_text[] = { "nr_mapped", "nr_file_pages", "nr_slab", + "nr_page_table_pages", /* Page state */ "nr_dirty", "nr_writeback", "nr_unstable", - "nr_page_table_pages", "pgpgin", "pgpgout", -- cgit v1.2.3 From b1e7a8fd854d2f895730e82137400012b509650e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:39 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_dirty to per zone counter This makes nr_dirty a per zone counter. Looping over all processors is avoided during writeback state determination. The counter aggregation for nr_dirty had to be undone in the NFS layer since we summed up the page counts from multiple zones. Someone more familiar with NFS should probably review what I have done. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 ++++++----- mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0faacfe1890..da854783009 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,7 +109,7 @@ struct writeback_state static void get_writeback_state(struct writeback_state *wbs) { - wbs->nr_dirty = read_page_state(nr_dirty); + wbs->nr_dirty = global_page_state(NR_FILE_DIRTY); wbs->nr_unstable = read_page_state(nr_unstable); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); @@ -641,7 +641,8 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); if (mapping_cap_account_dirty(mapping)) - inc_page_state(nr_dirty); + __inc_zone_page_state(page, + NR_FILE_DIRTY); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -728,9 +729,9 @@ int test_clear_page_dirty(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + __dec_zone_page_state(page, NR_FILE_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, flags); return 1; } write_unlock_irqrestore(&mapping->tree_lock, flags); @@ -761,7 +762,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + dec_zone_page_state(page, NR_FILE_DIRTY); return 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed3f2a7b407..c2b9aa4acc4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1314,7 +1314,7 @@ void show_free_areas(void) "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", active, inactive, - ps.nr_dirty, + global_page_state(NR_FILE_DIRTY), ps.nr_writeback, ps.nr_unstable, nr_free_pages(), diff --git a/mm/vmstat.c b/mm/vmstat.c index 292a35fe56c..1982fb533a4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -400,9 +400,9 @@ static char *vmstat_text[] = { "nr_file_pages", "nr_slab", "nr_page_table_pages", + "nr_dirty", /* Page state */ - "nr_dirty", "nr_writeback", "nr_unstable", -- cgit v1.2.3 From ce866b34ae1b7f1ce60234cf65855886ac7e7d30 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:40 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_writeback to per zone counter Conversion of nr_writeback to per zone counter. This removes the last page_state counter from arch/i386/mm/pgtable.c so we drop the page_state from there. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/vmstat.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index da854783009..3cfdff4b198 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -113,7 +113,7 @@ static void get_writeback_state(struct writeback_state *wbs) wbs->nr_unstable = read_page_state(nr_unstable); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); - wbs->nr_writeback = read_page_state(nr_writeback); + wbs->nr_writeback = global_page_state(NR_WRITEBACK); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b9aa4acc4..a12e894a8bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1315,7 +1315,7 @@ void show_free_areas(void) active, inactive, global_page_state(NR_FILE_DIRTY), - ps.nr_writeback, + global_page_state(NR_WRITEBACK), ps.nr_unstable, nr_free_pages(), global_page_state(NR_SLAB), diff --git a/mm/vmstat.c b/mm/vmstat.c index 1982fb533a4..e84c7e520a1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -401,9 +401,9 @@ static char *vmstat_text[] = { "nr_slab", "nr_page_table_pages", "nr_dirty", + "nr_writeback", /* Page state */ - "nr_writeback", "nr_unstable", "pgpgin", -- cgit v1.2.3 From fd39fc8561be33065306bdac0e30414e1e8ac8e1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:40 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_unstable to per zone counter Conversion of nr_unstable to a per zone counter We need to do some special modifications to the nfs code since there are multiple cases of disposition and we need to have a page ref for proper accounting. This converts the last critical page state of the VM and therefore we need to remove several functions that were depending on GET_PAGE_STATE_LAST in order to make the kernel compile again. We are only left with event type counters in page state. [akpm@osdl.org: bugfixes] Signed-off-by: Christoph Lameter Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 4 +--- mm/vmstat.c | 25 +------------------------ 3 files changed, 3 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3cfdff4b198..de9836f43db 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -110,7 +110,7 @@ struct writeback_state static void get_writeback_state(struct writeback_state *wbs) { wbs->nr_dirty = global_page_state(NR_FILE_DIRTY); - wbs->nr_unstable = read_page_state(nr_unstable); + wbs->nr_unstable = global_page_state(NR_UNSTABLE_NFS); wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES); wbs->nr_writeback = global_page_state(NR_WRITEBACK); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a12e894a8bc..6aa2c31f513 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1271,7 +1271,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - struct page_state ps; int cpu, temperature; unsigned long active; unsigned long inactive; @@ -1303,7 +1302,6 @@ void show_free_areas(void) } } - get_page_state(&ps); get_zone_counts(&active, &inactive, &free); printk("Free pages: %11ukB (%ukB HighMem)\n", @@ -1316,7 +1314,7 @@ void show_free_areas(void) inactive, global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), - ps.nr_unstable, + global_page_state(NR_UNSTABLE_NFS), nr_free_pages(), global_page_state(NR_SLAB), global_page_state(NR_FILE_MAPPED), diff --git a/mm/vmstat.c b/mm/vmstat.c index e84c7e520a1..9dc270aed5c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -45,28 +45,6 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) } } -void get_page_state_node(struct page_state *ret, int node) -{ - int nr; - cpumask_t mask = node_to_cpumask(node); - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr+1, &mask); -} - -void get_page_state(struct page_state *ret) -{ - int nr; - cpumask_t mask = CPU_MASK_ALL; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1, &mask); -} - void get_full_page_state(struct page_state *ret) { cpumask_t mask = CPU_MASK_ALL; @@ -402,10 +380,9 @@ static char *vmstat_text[] = { "nr_page_table_pages", "nr_dirty", "nr_writeback", - - /* Page state */ "nr_unstable", + /* Event counters */ "pgpgin", "pgpgout", "pswpin", -- cgit v1.2.3 From d2c5e30c9a1420902262aa923794d2ae4e0bc391 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:41 -0700 Subject: [PATCH] zoned vm counters: conversion of nr_bounce to per zone counter Conversion of nr_bounce to a per zone counter nr_bounce is only used for proc output. So it could be left as an event counter. However, the event counters may not be accurate and nr_bounce is categorizing types of pages in a zone. So we really need this to also be a per zone counter. [akpm@osdl.org: bugfix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 6 +++--- mm/vmstat.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 9b274fdf9d0..9b2a5403c44 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) if (bvec->bv_page == org_vec->bv_page) continue; - mempool_free(bvec->bv_page, pool); - dec_page_state(nr_bounce); + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); } bio_endio(bio_orig, bio_orig->bi_size, err); @@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, to->bv_page = mempool_alloc(pool, q->bounce_gfp); to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; - inc_page_state(nr_bounce); + inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/vmstat.c b/mm/vmstat.c index 9dc270aed5c..25e5ca7c174 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -381,6 +381,7 @@ static char *vmstat_text[] = { "nr_dirty", "nr_writeback", "nr_unstable", + "nr_bounce", /* Event counters */ "pgpgin", @@ -428,7 +429,6 @@ static char *vmstat_text[] = { "allocstall", "pgrotated", - "nr_bounce", }; /* -- cgit v1.2.3 From c24f21bda88df4574de0a32a2a1558a23adae1b8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:42 -0700 Subject: [PATCH] zoned vm counters: remove useless struct wbs Remove writeback state We can remove some functions now that were needed to calculate the page state for writeback control since these statistics are now directly available. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 85 +++++++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index de9836f43db..e630188ccc4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -99,23 +99,6 @@ EXPORT_SYMBOL(laptop_mode); static void background_writeout(unsigned long _min_pages); -struct writeback_state -{ - unsigned long nr_dirty; - unsigned long nr_unstable; - unsigned long nr_mapped; - unsigned long nr_writeback; -}; - -static void get_writeback_state(struct writeback_state *wbs) -{ - wbs->nr_dirty = global_page_state(NR_FILE_DIRTY); - wbs->nr_unstable = global_page_state(NR_UNSTABLE_NFS); - wbs->nr_mapped = global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES); - wbs->nr_writeback = global_page_state(NR_WRITEBACK); -} - /* * Work out the current dirty-memory clamping and background writeout * thresholds. @@ -134,8 +117,8 @@ static void get_writeback_state(struct writeback_state *wbs) * clamping level. */ static void -get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, - struct address_space *mapping) +get_dirty_limits(long *pbackground, long *pdirty, + struct address_space *mapping) { int background_ratio; /* Percentages */ int dirty_ratio; @@ -145,8 +128,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, unsigned long available_memory = total_pages; struct task_struct *tsk; - get_writeback_state(wbs); - #ifdef CONFIG_HIGHMEM /* * If this mapping can only allocate from low memory, @@ -157,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, #endif - unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; + unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / + total_pages; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -190,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, */ static void balance_dirty_pages(struct address_space *mapping) { - struct writeback_state wbs; long nr_reclaimable; long background_thresh; long dirty_thresh; @@ -208,11 +190,12 @@ static void balance_dirty_pages(struct address_space *mapping) .range_cyclic = 1, }; - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= + dirty_thresh) + break; if (!dirty_exceeded) dirty_exceeded = 1; @@ -225,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping) */ if (nr_reclaimable) { writeback_inodes(&wbc); - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, + &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + + global_page_state(NR_WRITEBACK) + <= dirty_thresh) + break; pages_written += write_chunk - wbc.nr_to_write; if (pages_written >= write_chunk) break; /* We've done our duty */ @@ -237,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) - dirty_exceeded = 0; + if (nr_reclaimable + global_page_state(NR_WRITEBACK) + <= dirty_thresh && dirty_exceeded) + dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* pdflush is already working this queue */ @@ -300,12 +287,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); /* * Boost the allowable dirty threshold a bit for page @@ -313,8 +299,9 @@ void throttle_vm_writeout(void) */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ - if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) - break; + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; blk_congestion_wait(WRITE, HZ/10); } } @@ -337,12 +324,12 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); - if (wbs.nr_dirty + wbs.nr_unstable < background_thresh + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; wbc.encountered_congestion = 0; @@ -366,12 +353,9 @@ static void background_writeout(unsigned long _min_pages) */ int wakeup_pdflush(long nr_pages) { - if (nr_pages == 0) { - struct writeback_state wbs; - - get_writeback_state(&wbs); - nr_pages = wbs.nr_dirty + wbs.nr_unstable; - } + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); return pdflush_operation(background_writeout, nr_pages); } @@ -402,7 +386,6 @@ static void wb_kupdate(unsigned long arg) unsigned long start_jif; unsigned long next_jif; long nr_to_write; - struct writeback_state wbs; struct writeback_control wbc = { .bdi = NULL, .sync_mode = WB_SYNC_NONE, @@ -415,11 +398,11 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - get_writeback_state(&wbs); oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; next_jif = start_jif + dirty_writeback_interval; - nr_to_write = wbs.nr_dirty + wbs.nr_unstable + + nr_to_write = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { wbc.encountered_congestion = 0; -- cgit v1.2.3 From bab1846a0582f627f5ec22aa2dc5f4f3e82e8176 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 30 Jun 2006 01:55:43 -0700 Subject: [PATCH] zoned-vm-counters: remove read_page_state() No callers. Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 25e5ca7c174..06a6d105219 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -52,20 +52,6 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long read_page_state_offset(unsigned long offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - void __mod_page_state_offset(unsigned long offset, unsigned long delta) { void *ptr; -- cgit v1.2.3 From ca889e6c45e0b112cb2ca9d35afc66297519b5d5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:44 -0700 Subject: [PATCH] Use Zoned VM Counters for NUMA statistics The numa statistics are really event counters. But they are per node and so we have had special treatment for these counters through additional fields on the pcp structure. We can now use the per zone nature of the zoned VM counters to realize these. This will shrink the size of the pcp structure on NUMA systems. We will have some room to add additional per zone counters that will all still fit in the same cacheline. Bits Prior pcp size Size after patch We can add ------------------------------------------------------------------ 64 128 bytes (16 words) 80 bytes (10 words) 48 32 76 bytes (19 words) 56 bytes (14 words) 8 (64 byte cacheline) 72 (128 byte) Remove the special statistics for numa and replace them with zoned vm counters. This has the side effect that global sums of these events now show up in /proc/vmstat. Also take the opportunity to move the zone_statistics() function from page_alloc.c into vmstat.c. Discussions: V2 http://marc.theaimsgroup.com/?t=115048227000002&r=1&w=2 Signed-off-by: Christoph Lameter Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 ++--- mm/page_alloc.c | 23 +----------------- mm/vmstat.c | 73 +++++++++++++++++++++++++++++++++++++-------------------- 3 files changed, 50 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b9740bbf4c..e07e27e846a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1209,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6aa2c31f513..d61671260f9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -709,27 +709,6 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) -{ -#ifdef CONFIG_NUMA - pg_data_t *pg = z->zone_pgdat; - pg_data_t *orig = zonelist->zones[0]->zone_pgdat; - struct per_cpu_pageset *p; - - p = zone_pcp(z, cpu); - if (pg == orig) { - p->numa_hit++; - } else { - p->numa_miss++; - zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; - } - if (pg == NODE_DATA(numa_node_id())) - p->local_node++; - else - p->other_node++; -#endif -} - /* * Free a 0-order page */ @@ -827,7 +806,7 @@ again: } __mod_page_state_zone(zone, pgalloc, 1 << order); - zone_statistics(zonelist, zone, cpu); + zone_statistics(zonelist, zone); local_irq_restore(flags); put_cpu(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 06a6d105219..ee7f8966625 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -185,9 +185,8 @@ EXPORT_SYMBOL(mod_zone_page_state); * in between and therefore the atomicity vs. interrupt cannot be exploited * in a useful way here. */ -void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct zone *zone = page_zone(page); s8 *p = diff_pointer(zone, item); (*p)++; @@ -197,6 +196,11 @@ void __inc_zone_page_state(struct page *page, enum zone_stat_item item) *p = 0; } } + +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -213,22 +217,23 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} + void inc_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; struct zone *zone; - s8 *p; zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)++; - - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __inc_zone_state(zone, item); local_irq_restore(flags); } EXPORT_SYMBOL(inc_zone_page_state); @@ -297,6 +302,28 @@ EXPORT_SYMBOL(refresh_vm_stats); #endif +#ifdef CONFIG_NUMA +/* + * zonelist = the list of zones passed to the allocator + * z = the zone from which the allocation occurred. + * + * Must be called with interrupts disabled. + */ +void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ + if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { + __inc_zone_state(z, NUMA_HIT); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); + } + if (z->zone_pgdat == NODE_DATA(numa_node_id())) + __inc_zone_state(z, NUMA_LOCAL); + else + __inc_zone_state(z, NUMA_OTHER); +} +#endif + #ifdef CONFIG_PROC_FS #include @@ -369,6 +396,15 @@ static char *vmstat_text[] = { "nr_unstable", "nr_bounce", +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + /* Event counters */ "pgpgin", "pgpgout", @@ -490,21 +526,6 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } -#ifdef CONFIG_NUMA - seq_printf(m, - "\n numa_hit: %lu" - "\n numa_miss: %lu" - "\n numa_foreign: %lu" - "\n interleave_hit: %lu" - "\n local_node: %lu" - "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); -#endif } seq_printf(m, "\n all_unreclaimable: %u" -- cgit v1.2.3 From f8891e5e1f93a128c3900f82035e8541357896a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:45 -0700 Subject: [PATCH] Light weight event counters The remaining counters in page_state after the zoned VM counter patches have been applied are all just for show in /proc/vmstat. They have no essential function for the VM. We use a simple increment of per cpu variables. In order to avoid the most severe races we disable preempt. Preempt does not prevent the race between an increment and an interrupt handler incrementing the same statistics counter. However, that race is exceedingly rare, we may only loose one increment or so and there is no requirement (at least not in kernel) that the vm event counters have to be accurate. In the non preempt case this results in a simple increment for each counter. For many architectures this will be reduced by the compiler to a single instruction. This single instruction is atomic for i386 and x86_64. And therefore even the rare race condition in an interrupt is avoided for both architectures in most cases. The patchset also adds an off switch for embedded systems that allows a building of linux kernels without these counters. The implementation of these counters is through inline code that hopefully results in only a single instruction increment instruction being emitted (i386, x86_64) or in the increment being hidden though instruction concurrency (EPIC architectures such as ia64 can get that done). Benefits: - VM event counter operations usually reduce to a single inline instruction on i386 and x86_64. - No interrupt disable, only preempt disable for the preempt case. Preempt disable can also be avoided by moving the counter into a spinlock. - Handling is similar to zoned VM counters. - Simple and easily extendable. - Can be omitted to reduce memory use for embedded use. References: RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2 RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2 local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2 V2 http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2 V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2 V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2 Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 +- mm/memory.c | 4 +- mm/page_alloc.c | 21 ++----- mm/page_io.c | 4 +- mm/shmem.c | 4 +- mm/swap.c | 4 +- mm/vmscan.c | 23 ++++---- mm/vmstat.c | 171 +++++++++++++++++++++++++++++--------------------------- 8 files changed, 114 insertions(+), 121 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 87d62c44c3f..796a5471b49 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1416,7 +1416,7 @@ retry_find: */ if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } did_readaround = 1; ra_pages = max_sane_readahead(file->f_ra.ra_pages); @@ -1487,7 +1487,7 @@ no_cached_page: page_not_uptodate: if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } lock_page(page); diff --git a/mm/memory.c b/mm/memory.c index 1a78791590f..7e2a4b1580e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1951,7 +1951,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); grab_swap_token(); } @@ -2324,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); - inc_page_state(pgfault); + count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d61671260f9..30b0b97ad02 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -456,7 +456,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - __mod_page_state(pgfree, 1 << order); + __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); local_irq_restore(flags); } @@ -729,7 +729,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - __inc_page_state(pgfree); + __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) { @@ -805,7 +805,7 @@ again: goto failed; } - __mod_page_state_zone(zone, pgalloc, 1 << order); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); local_irq_restore(flags); put_cpu(); @@ -2101,24 +2101,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; - unsigned long *src, *dest; if (action == CPU_DEAD) { - int i; - local_irq_disable(); __drain_pages(cpu); - - /* Add dead cpu's page_states to our own. */ - dest = (unsigned long *)&__get_cpu_var(page_states); - src = (unsigned long *)&per_cpu(page_states, cpu); - - for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); - i++) { - dest[i] += src[i]; - src[i] = 0; - } - + vm_events_fold_cpu(cpu); local_irq_enable(); refresh_cpu_vm_stats(cpu); } diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d53889..88029948d00 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) } if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNC); - inc_page_state(pswpout); + count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); @@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page) ret = -ENOMEM; goto out; } - inc_page_state(pswpin); + count_vm_event(PSWPIN); submit_bio(READ, bio); out: return ret; diff --git a/mm/shmem.c b/mm/shmem.c index b14ff817d16..a9c09e0ba70 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1045,12 +1045,12 @@ repeat: swappage = lookup_swap_cache(swap); if (!swappage) { shmem_swp_unmap(entry); - spin_unlock(&info->lock); /* here we actually do the io */ if (type && *type == VM_FAULT_MINOR) { - inc_page_state(pgmajfault); + __count_vm_event(PGMAJFAULT); *type = VM_FAULT_MAJOR; } + spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); if (!swappage) { spin_lock(&info->lock); diff --git a/mm/swap.c b/mm/swap.c index 990868afc1c..8fd095c4ae5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -87,7 +87,7 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { list_move_tail(&page->lru, &zone->inactive_list); - inc_page_state(pgrotated); + __count_vm_event(PGROTATED); } if (!test_clear_page_writeback(page)) BUG(); @@ -107,7 +107,7 @@ void fastcall activate_page(struct page *page) del_page_from_inactive_list(zone, page); SetPageActive(page); add_page_to_active_list(zone, page); - inc_page_state(pgactivate); + __count_vm_event(PGACTIVATE); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d6942436ac9..ff2ebe9458a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -215,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - mod_page_state(slabs_scanned, this_scan); + count_vm_events(SLABS_SCANNED, this_scan); total_scan -= this_scan; cond_resched(); @@ -569,7 +569,7 @@ keep: list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); - mod_page_state(pgactivate, pgactivate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -659,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { - __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - __mod_page_state(kswapd_steal, nr_freed); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + __count_vm_events(KSWAPD_STEAL, nr_freed); } else - __mod_page_state_zone(zone, pgscan_direct, nr_scan); - __mod_page_state_zone(zone, pgsteal, nr_freed); + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + __count_vm_events(PGACTIVATE, nr_freed); if (nr_taken == 0) goto done; @@ -841,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } zone->nr_active += pgmoved; - spin_unlock(&zone->lru_lock); - __mod_page_state_zone(zone, pgrefill, pgscanned); - __mod_page_state(pgdeactivate, pgdeactivate); - local_irq_enable(); + __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_vm_events(PGDEACTIVATE, pgdeactivate); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); } @@ -977,7 +976,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .swappiness = vm_swappiness, }; - inc_page_state(allocstall); + count_vm_event(ALLOCSTALL); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -1074,7 +1073,7 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - inc_page_state(pageoutrun); + count_vm_event(PAGEOUTRUN); for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; diff --git a/mm/vmstat.c b/mm/vmstat.c index ee7f8966625..73b83d67bab 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -13,66 +13,6 @@ #include #include -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) -{ - unsigned cpu; - - memset(ret, 0, nr * sizeof(unsigned long)); - cpus_and(*cpumask, *cpumask, cpu_online_map); - - for_each_cpu_mask(cpu, *cpumask) { - unsigned long *in; - unsigned long *out; - unsigned off; - unsigned next_cpu; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - next_cpu = next_cpu(cpu, *cpumask); - if (likely(next_cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, next_cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_full_page_state(struct page_state *ret) -{ - cpumask_t mask = CPU_MASK_ALL; - - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); -} - -void __mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - void *ptr; - - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; -} -EXPORT_SYMBOL(__mod_page_state_offset); - -void mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - unsigned long flags; - void *ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_page_state_offset); - void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) { @@ -106,6 +46,63 @@ void get_zone_counts(unsigned long *active, } } +#ifdef CONFIG_VM_EVENT_COUNTERS +DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +EXPORT_PER_CPU_SYMBOL(vm_event_states); + +static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) +{ + int cpu = 0; + int i; + + memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); + + cpu = first_cpu(*cpumask); + while (cpu < NR_CPUS) { + struct vm_event_state *this = &per_cpu(vm_event_states, cpu); + + cpu = next_cpu(cpu, *cpumask); + + if (cpu < NR_CPUS) + prefetch(&per_cpu(vm_event_states, cpu)); + + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + ret[i] += this->event[i]; + } +} + +/* + * Accumulate the vm event counters across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. +*/ +void all_vm_events(unsigned long *ret) +{ + sum_vm_events(ret, &cpu_online_map); +} + +#ifdef CONFIG_HOTPLUG +/* + * Fold the foreign cpu events into our own. + * + * This is adding to the events on one processor + * but keeps the global counts constant. + */ +void vm_events_fold_cpu(int cpu) +{ + struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); + int i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + count_vm_events(i, fold_state->event[i]); + fold_state->event[i] = 0; + } +} +#endif /* CONFIG_HOTPLUG */ + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + /* * Manage combined zone based / global counters * @@ -405,16 +402,16 @@ static char *vmstat_text[] = { "numa_other", #endif - /* Event counters */ +#ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", "pgpgout", "pswpin", "pswpout", - "pgalloc_high", - "pgalloc_normal", - "pgalloc_dma32", "pgalloc_dma", + "pgalloc_dma32", + "pgalloc_normal", + "pgalloc_high", "pgfree", "pgactivate", @@ -423,25 +420,25 @@ static char *vmstat_text[] = { "pgfault", "pgmajfault", - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma32", "pgrefill_dma", + "pgrefill_dma32", + "pgrefill_normal", + "pgrefill_high", - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma32", "pgsteal_dma", + "pgsteal_dma32", + "pgsteal_normal", + "pgsteal_high", - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_kswapd_dma32", + "pgscan_kswapd_normal", + "pgscan_kswapd_high", - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma32", "pgscan_direct_dma", + "pgscan_direct_dma32", + "pgscan_direct_normal", + "pgscan_direct_high", "pginodesteal", "slabs_scanned", @@ -451,6 +448,7 @@ static char *vmstat_text[] = { "allocstall", "pgrotated", +#endif }; /* @@ -553,23 +551,32 @@ struct seq_operations zoneinfo_op = { static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; - struct page_state *ps; +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *e; +#endif int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; +#ifdef CONFIG_VM_EVENT_COUNTERS v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(*ps), GFP_KERNEL); + + sizeof(struct vm_event_state), GFP_KERNEL); +#else + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), + GFP_KERNEL); +#endif m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); - ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; +#ifdef CONFIG_VM_EVENT_COUNTERS + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif return v + *pos; } -- cgit v1.2.3 From ed11d9eb2228acc483c819ab353e3c41bcb158fa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 30 Jun 2006 01:55:45 -0700 Subject: [PATCH] slab: consolidate code to free slabs from freelist Post and discussion: http://marc.theaimsgroup.com/?t=115074342800003&r=1&w=2 Code in __shrink_node() duplicates code in cache_reap() Add a new function drain_freelist that removes slabs with objects that are already free and use that in various places. This eliminates the __node_shrink() function and provides the interrupt holdoff reduction from slab_free to code that used to call __node_shrink. [akpm@osdl.org: build fixes] Signed-off-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 119 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 51 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5dcfb904480..3936af34454 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -309,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_AC 1 #define SIZE_L3 (1 + MAX_NUMNODES) +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); +static void enable_cpucache(struct kmem_cache *cachep); +static void cache_reap(void *unused); + /* * This function must be completely optimized away if a constant is passed to * it. Mostly the same as what is in linux/slab.h except it returns an index. @@ -456,7 +463,7 @@ struct kmem_cache { #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_INC_REAPED(x) ((x)->reaped++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -480,7 +487,7 @@ struct kmem_cache { #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_INC_REAPED(x) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -700,12 +707,6 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); -static void enable_cpucache(struct kmem_cache *cachep); -static void cache_reap(void *unused); -static int __node_shrink(struct kmem_cache *cachep, int node); - static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { return cachep->array[smp_processor_id()]; @@ -1241,10 +1242,7 @@ free_array_cache: l3 = cachep->nodelists[node]; if (!l3) continue; - spin_lock_irq(&l3->list_lock); - /* free slabs belonging to this node */ - __node_shrink(cachep, node); - spin_unlock_irq(&l3->list_lock); + drain_freelist(cachep, l3, l3->free_objects); } mutex_unlock(&cache_chain_mutex); break; @@ -2248,32 +2246,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep) } } -static int __node_shrink(struct kmem_cache *cachep, int node) +/* + * Remove slabs from the list of free slabs. + * Specify the number of slabs to drain in tofree. + * + * Returns the actual number of slabs released. + */ +static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree) { + struct list_head *p; + int nr_freed; struct slab *slabp; - struct kmem_list3 *l3 = cachep->nodelists[node]; - int ret; - for (;;) { - struct list_head *p; + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.prev; - if (p == &l3->slabs_free) - break; + if (p == &l3->slabs_free) { + spin_unlock_irq(&l3->list_lock); + goto out; + } - slabp = list_entry(l3->slabs_free.prev, struct slab, list); + slabp = list_entry(p, struct slab, list); #if DEBUG BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - - l3->free_objects -= cachep->num; + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. + */ + l3->free_objects -= cache->num; spin_unlock_irq(&l3->list_lock); - slab_destroy(cachep, slabp); - spin_lock_irq(&l3->list_lock); + slab_destroy(cache, slabp); + nr_freed++; } - ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); - return ret; +out: + return nr_freed; } static int __cache_shrink(struct kmem_cache *cachep) @@ -2286,11 +2297,13 @@ static int __cache_shrink(struct kmem_cache *cachep) check_irq_on(); for_each_online_node(i) { l3 = cachep->nodelists[i]; - if (l3) { - spin_lock_irq(&l3->list_lock); - ret += __node_shrink(cachep, i); - spin_unlock_irq(&l3->list_lock); - } + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + ret += !list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial); } return (ret ? 1 : 0); } @@ -3694,10 +3707,6 @@ static void cache_reap(void *unused) } list_for_each_entry(searchp, &cache_chain, next) { - struct list_head *p; - int tofree; - struct slab *slabp; - check_irq_on(); /* @@ -3722,41 +3731,15 @@ static void cache_reap(void *unused) drain_array(searchp, l3, l3->shared, 0, node); - if (l3->free_touched) { + if (l3->free_touched) l3->free_touched = 0; - goto next; - } - - tofree = (l3->free_limit + 5 * searchp->num - 1) / - (5 * searchp->num); - do { - /* - * Do not lock if there are no free blocks. - */ - if (list_empty(&l3->slabs_free)) - break; - - spin_lock_irq(&l3->list_lock); - p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) { - spin_unlock_irq(&l3->list_lock); - break; - } + else { + int freed; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); - list_del(&slabp->list); - STATS_INC_REAPED(searchp); - - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. searchp cannot disappear, we hold - * cache_chain_lock - */ - l3->free_objects -= searchp->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(searchp, slabp); - } while (--tofree > 0); + freed = drain_freelist(searchp, l3, (l3->free_limit + + 5 * searchp->num - 1) / (5 * searchp->num)); + STATS_ADD_REAPED(searchp, freed); + } next: cond_resched(); } -- cgit v1.2.3